git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH 1/2] bundle: plug resource leak
@ 2016-03-01 23:35 Junio C Hamano
  2016-03-01 23:36 ` [PATCH 2/2] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
  2016-03-02  8:54 ` [PATCH 1/2] bundle: plug resource leak Jeff King
  0 siblings, 2 replies; 37+ messages in thread
From: Junio C Hamano @ 2016-03-01 23:35 UTC (permalink / raw)
  To: git

The bundle header structure holds two lists of refs and object
names, which should be released when the user is done with it.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bundle.c    | 12 ++++++++++++
 bundle.h    |  1 +
 transport.c |  1 +
 3 files changed, 14 insertions(+)

diff --git a/bundle.c b/bundle.c
index 506ac49..9c5a6f0 100644
--- a/bundle.c
+++ b/bundle.c
@@ -102,6 +102,18 @@ int is_bundle(const char *path, int quiet)
 	return (fd >= 0);
 }
 
+void release_bundle_header(struct bundle_header *header)
+{
+	int i;
+
+	for (i = 0; i < header->prerequisites.nr; i++)
+		free(header->prerequisites.list[i].name);
+	free(header->prerequisites.list);
+	for (i = 0; i < header->references.nr; i++)
+		free(header->references.list[i].name);
+	free(header->references.list);
+}
+
 static int list_refs(struct ref_list *r, int argc, const char **argv)
 {
 	int i;
diff --git a/bundle.h b/bundle.h
index 1584e4d..f7ce23b 100644
--- a/bundle.h
+++ b/bundle.h
@@ -23,5 +23,6 @@ int verify_bundle(struct bundle_header *header, int verbose);
 int unbundle(struct bundle_header *header, int bundle_fd, int flags);
 int list_bundle_refs(struct bundle_header *header,
 		int argc, const char **argv);
+void release_bundle_header(struct bundle_header *);
 
 #endif
diff --git a/transport.c b/transport.c
index ca3cfa4..08e15c5 100644
--- a/transport.c
+++ b/transport.c
@@ -107,6 +107,7 @@ static int close_bundle(struct transport *transport)
 	struct bundle_transport_data *data = transport->data;
 	if (data->fd > 0)
 		close(data->fd);
+	release_bundle_header(&data->header);
 	free(data);
 	return 0;
 }
-- 
2.8.0-rc0-114-g0b3e5e5

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 2/2] bundle: keep a copy of bundle file name in the in-core bundle header
  2016-03-01 23:35 [PATCH 1/2] bundle: plug resource leak Junio C Hamano
@ 2016-03-01 23:36 ` Junio C Hamano
  2016-03-02  9:01   ` Jeff King
  2016-03-02  8:54 ` [PATCH 1/2] bundle: plug resource leak Jeff King
  1 sibling, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2016-03-01 23:36 UTC (permalink / raw)
  To: git

This will be necessary when we start reading from a split bundle
where the header and the thin-pack data live in different files.

The in-core bundle header will read from a file that has the header,
and will record the path to that file.  We would find the name of
the file that hosts the thin-pack data from the header, and we would
take that name as relative to the file we read the header from.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/bundle.c |  5 +++--
 bundle.c         | 21 +++++++++++----------
 bundle.h         |  3 ++-
 transport.c      |  3 ++-
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/builtin/bundle.c b/builtin/bundle.c
index 4883a43..e63388d 100644
--- a/builtin/bundle.c
+++ b/builtin/bundle.c
@@ -36,8 +36,9 @@ int cmd_bundle(int argc, const char **argv, const char *prefix)
 	}
 
 	memset(&header, 0, sizeof(header));
-	if (strcmp(cmd, "create") && (bundle_fd =
-				read_bundle_header(bundle_file, &header)) < 0)
+	header.bundle_file = bundle_file;
+	if (strcmp(cmd, "create") &&
+	    (bundle_fd = read_bundle_header(&header)) < 0)
 		return 1;
 
 	if (!strcmp(cmd, "verify")) {
diff --git a/bundle.c b/bundle.c
index 9c5a6f0..efe19e0 100644
--- a/bundle.c
+++ b/bundle.c
@@ -21,8 +21,7 @@ static void add_to_ref_list(const unsigned char *sha1, const char *name,
 	list->nr++;
 }
 
-static int parse_bundle_header(int fd, struct bundle_header *header,
-			       const char *report_path)
+static int parse_bundle_header(int fd, struct bundle_header *header, int quiet)
 {
 	struct strbuf buf = STRBUF_INIT;
 	int status = 0;
@@ -30,9 +29,9 @@ static int parse_bundle_header(int fd, struct bundle_header *header,
 	/* The bundle header begins with the signature */
 	if (strbuf_getwholeline_fd(&buf, fd, '\n') ||
 	    strcmp(buf.buf, bundle_signature)) {
-		if (report_path)
+		if (!quiet)
 			error(_("'%s' does not look like a v2 bundle file"),
-			      report_path);
+			      header->bundle_file);
 		status = -1;
 		goto abort;
 	}
@@ -57,7 +56,7 @@ static int parse_bundle_header(int fd, struct bundle_header *header,
 		if (get_sha1_hex(buf.buf, sha1) ||
 		    (buf.len > 40 && !isspace(buf.buf[40])) ||
 		    (!is_prereq && buf.len <= 40)) {
-			if (report_path)
+			if (!quiet)
 				error(_("unrecognized header: %s%s (%d)"),
 				      (is_prereq ? "-" : ""), buf.buf, (int)buf.len);
 			status = -1;
@@ -79,13 +78,13 @@ static int parse_bundle_header(int fd, struct bundle_header *header,
 	return fd;
 }
 
-int read_bundle_header(const char *path, struct bundle_header *header)
+int read_bundle_header(struct bundle_header *header)
 {
-	int fd = open(path, O_RDONLY);
+	int fd = open(header->bundle_file, O_RDONLY);
 
 	if (fd < 0)
-		return error(_("could not open '%s'"), path);
-	return parse_bundle_header(fd, header, path);
+		return error(_("could not open '%s'"), header->bundle_file);
+	return parse_bundle_header(fd, header, 0);
 }
 
 int is_bundle(const char *path, int quiet)
@@ -96,7 +95,7 @@ int is_bundle(const char *path, int quiet)
 	if (fd < 0)
 		return 0;
 	memset(&header, 0, sizeof(header));
-	fd = parse_bundle_header(fd, &header, quiet ? NULL : path);
+	fd = parse_bundle_header(fd, &header, quiet);
 	if (fd >= 0)
 		close(fd);
 	return (fd >= 0);
@@ -112,6 +111,8 @@ void release_bundle_header(struct bundle_header *header)
 	for (i = 0; i < header->references.nr; i++)
 		free(header->references.list[i].name);
 	free(header->references.list);
+
+	free((void *)header->bundle_file);
 }
 
 static int list_refs(struct ref_list *r, int argc, const char **argv)
diff --git a/bundle.h b/bundle.h
index f7ce23b..cf771df 100644
--- a/bundle.h
+++ b/bundle.h
@@ -10,12 +10,13 @@ struct ref_list {
 };
 
 struct bundle_header {
+	const char *bundle_file;
 	struct ref_list prerequisites;
 	struct ref_list references;
 };
 
 int is_bundle(const char *path, int quiet);
-int read_bundle_header(const char *path, struct bundle_header *header);
+int read_bundle_header(struct bundle_header *header);
 int create_bundle(struct bundle_header *header, const char *path,
 		int argc, const char **argv);
 int verify_bundle(struct bundle_header *header, int verbose);
diff --git a/transport.c b/transport.c
index 08e15c5..03ce149 100644
--- a/transport.c
+++ b/transport.c
@@ -81,7 +81,8 @@ static struct ref *get_refs_from_bundle(struct transport *transport, int for_pus
 
 	if (data->fd > 0)
 		close(data->fd);
-	data->fd = read_bundle_header(transport->url, &data->header);
+	data->header.bundle_file = xstrdup(transport->url);
+	data->fd = read_bundle_header(&data->header);
 	if (data->fd < 0)
 		die ("Could not read bundle '%s'.", transport->url);
 	for (i = 0; i < data->header.references.nr; i++) {
-- 
2.8.0-rc0-114-g0b3e5e5

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/2] bundle: plug resource leak
  2016-03-01 23:35 [PATCH 1/2] bundle: plug resource leak Junio C Hamano
  2016-03-01 23:36 ` [PATCH 2/2] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
@ 2016-03-02  8:54 ` Jeff King
  2016-03-02  9:00   ` Junio C Hamano
  1 sibling, 1 reply; 37+ messages in thread
From: Jeff King @ 2016-03-02  8:54 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, Mar 01, 2016 at 03:35:34PM -0800, Junio C Hamano wrote:

> The bundle header structure holds two lists of refs and object
> names, which should be released when the user is done with it.
> 
> Signed-off-by: Junio C Hamano <gitster@pobox.com>
> ---
>  bundle.c    | 12 ++++++++++++
>  bundle.h    |  1 +
>  transport.c |  1 +
>  3 files changed, 14 insertions(+)
> 
> diff --git a/bundle.c b/bundle.c
> index 506ac49..9c5a6f0 100644
> --- a/bundle.c
> +++ b/bundle.c
> @@ -102,6 +102,18 @@ int is_bundle(const char *path, int quiet)
>  	return (fd >= 0);
>  }
>  
> +void release_bundle_header(struct bundle_header *header)
> +{
> +	int i;
> +
> +	for (i = 0; i < header->prerequisites.nr; i++)
> +		free(header->prerequisites.list[i].name);
> +	free(header->prerequisites.list);
> +	for (i = 0; i < header->references.nr; i++)
> +		free(header->references.list[i].name);
> +	free(header->references.list);
> +}

Looks good. It's probably not worth adding a release_ref_list() to
handle the repeated data structures.

I do find it hard to believe that the bundle code had to invent its own
ref storage data structure, and couldn't just use "struct ref" like all
of the other code. It doesn't look like we ever sort it or do
non-sequential access. The linked-list "struct ref" probably would have
been fine.

Not a problem you are introducing, of course, but if you are touching
this code a lot, it might be worth seeing how painful it is.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/2] bundle: plug resource leak
  2016-03-02  8:54 ` [PATCH 1/2] bundle: plug resource leak Jeff King
@ 2016-03-02  9:00   ` Junio C Hamano
  2016-03-02  9:02     ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2016-03-02  9:00 UTC (permalink / raw)
  To: Jeff King; +Cc: git

Jeff King <peff@peff.net> writes:

> I do find it hard to believe that the bundle code had to invent its own
> ref storage data structure, and couldn't just use "struct ref" like all
> of the other code. It doesn't look like we ever sort it or do
> non-sequential access. The linked-list "struct ref" probably would have
> been fine.
>
> Not a problem you are introducing, of course, but if you are touching
> this code a lot, it might be worth seeing how painful it is.

The bundle code being fairly old, I actually wouldn't be surprised
if it predated the wide use of "struct ref" ;-)

It is not performance critical to add entries to the list of
prerequisites or references (i.e. it is OK to have these as linear
lists, not linked lists of "struct ref"), and these lists do not
have to be ultra-efficient in their storage use (i.e. it is OK to
replace these with "struct ref" linked lists), so we could go either
way.  It's not like we would be using a lot of helper functions we
already have for "struct ref" in this code, so I'm inclined to give
a very low priority to the task of rethinking this data structure.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/2] bundle: keep a copy of bundle file name in the in-core bundle header
  2016-03-01 23:36 ` [PATCH 2/2] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
@ 2016-03-02  9:01   ` Jeff King
  2016-03-02 18:15     ` Junio C Hamano
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2016-03-02  9:01 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, Mar 01, 2016 at 03:36:26PM -0800, Junio C Hamano wrote:

> This will be necessary when we start reading from a split bundle
> where the header and the thin-pack data live in different files.
> 
> The in-core bundle header will read from a file that has the header,
> and will record the path to that file.  We would find the name of
> the file that hosts the thin-pack data from the header, and we would
> take that name as relative to the file we read the header from.

Neat. I'm hoping this means you're working on split bundles. :)

> diff --git a/builtin/bundle.c b/builtin/bundle.c
> index 4883a43..e63388d 100644
> --- a/builtin/bundle.c
> +++ b/builtin/bundle.c
> @@ -36,8 +36,9 @@ int cmd_bundle(int argc, const char **argv, const char *prefix)
>  	}
>  
>  	memset(&header, 0, sizeof(header));
> -	if (strcmp(cmd, "create") && (bundle_fd =
> -				read_bundle_header(bundle_file, &header)) < 0)
> +	header.bundle_file = bundle_file;

What are the memory ownership rules for header.bundle_file?

Here you assign from either an argv parameter or a stack buffer, and
here...

> @@ -112,6 +111,8 @@ void release_bundle_header(struct bundle_header *header)
>  	for (i = 0; i < header->references.nr; i++)
>  		free(header->references.list[i].name);
>  	free(header->references.list);
> +
> +	free((void *)header->bundle_file);
>  }

You free it.

The call in get_refs_from_bundle does do an xstrdup().

Should we have:

  void init_bundle_header(struct bundle_header *header, const char *file)
  {
	memset(header, 0, sizeof(*header));
	header->bundle_file = xstrdup(file);
  }

to abstract the whole procedure?

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/2] bundle: plug resource leak
  2016-03-02  9:00   ` Junio C Hamano
@ 2016-03-02  9:02     ` Jeff King
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff King @ 2016-03-02  9:02 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Wed, Mar 02, 2016 at 01:00:38AM -0800, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > I do find it hard to believe that the bundle code had to invent its own
> > ref storage data structure, and couldn't just use "struct ref" like all
> > of the other code. It doesn't look like we ever sort it or do
> > non-sequential access. The linked-list "struct ref" probably would have
> > been fine.
> >
> > Not a problem you are introducing, of course, but if you are touching
> > this code a lot, it might be worth seeing how painful it is.
> 
> The bundle code being fairly old, I actually wouldn't be surprised
> if it predated the wide use of "struct ref" ;-)
> 
> It is not performance critical to add entries to the list of
> prerequisites or references (i.e. it is OK to have these as linear
> lists, not linked lists of "struct ref"), and these lists do not
> have to be ultra-efficient in their storage use (i.e. it is OK to
> replace these with "struct ref" linked lists), so we could go either
> way.  It's not like we would be using a lot of helper functions we
> already have for "struct ref" in this code, so I'm inclined to give
> a very low priority to the task of rethinking this data structure.

Sure, I agree it's low priority by itself. It was more something to
consider if you find that you are touching the bundle code a lot.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/2] bundle: keep a copy of bundle file name in the in-core bundle header
  2016-03-02  9:01   ` Jeff King
@ 2016-03-02 18:15     ` Junio C Hamano
  2016-03-02 20:32       ` [PATCH v2 0/4] "split bundle" preview Junio C Hamano
  0 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2016-03-02 18:15 UTC (permalink / raw)
  To: Jeff King; +Cc: git

Jeff King <peff@peff.net> writes:

> On Tue, Mar 01, 2016 at 03:36:26PM -0800, Junio C Hamano wrote:
>
>> This will be necessary when we start reading from a split bundle
>> where the header and the thin-pack data live in different files.
>> 
>> The in-core bundle header will read from a file that has the header,
>> and will record the path to that file.  We would find the name of
>> the file that hosts the thin-pack data from the header, and we would
>> take that name as relative to the file we read the header from.
>
> Neat. I'm hoping this means you're working on split bundles. :)

Let's just say that during the -rc freeze period, because I can stop
looking at or queuing completely new topics to encourage people who
are responsible for topics in the upcoming release to focus more on
responding to regressions and follow-up fixes necessary, I have a
better chance of having some leftover time to look into things
myself, at least enough to figure out what needs to be done ;-)

>> -	if (strcmp(cmd, "create") && (bundle_fd =
>> -				read_bundle_header(bundle_file, &header)) < 0)
>> +	header.bundle_file = bundle_file;
>
> What are the memory ownership rules for header.bundle_file?
>
> Here you assign from either an argv parameter or a stack buffer, and
> here...
>
>> @@ -112,6 +111,8 @@ void release_bundle_header(struct bundle_header *header)
>>  	for (i = 0; i < header->references.nr; i++)
>>  		free(header->references.list[i].name);
>>  	free(header->references.list);
>> +
>> +	free((void *)header->bundle_file);
>>  }
>
> You free it.
>
> The call in get_refs_from_bundle does do an xstrdup().

Good eyes.

>
> Should we have:
>
>   void init_bundle_header(struct bundle_header *header, const char *file)
>   {
> 	memset(header, 0, sizeof(*header));
> 	header->bundle_file = xstrdup(file);
>   }
>
> to abstract the whole procedure?

Maybe, maybe not.  I'll decide after adding the bundle_version field
to the structure (which will be read from an existing bundle, but
which will have to be set for a bundle being created).

Thanks.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH v2 0/4] "split bundle" preview
  2016-03-02 18:15     ` Junio C Hamano
@ 2016-03-02 20:32       ` Junio C Hamano
  2016-03-02 20:32         ` [PATCH v2 1/4] bundle doc: 'verify' is not about verifying the bundle Junio C Hamano
                           ` (3 more replies)
  0 siblings, 4 replies; 37+ messages in thread
From: Junio C Hamano @ 2016-03-02 20:32 UTC (permalink / raw)
  To: git; +Cc: Jeff King

Here is a preview of the "split bundle" stuff that may serve as one
of the enabling technology to offload "git clone" traffic off of the
server core network to CDN.

Changes:

 - The "checksum" bit about the in-bundle packdata, which was
   incorrect, was dropped from the proposed log message of 1/4.

 - "init_bundle_header()" helper has been added to 3/4; the name of
   the new field in bundle_header structure is now "filename", not
   "bundle_file".  It is silly to name a field with "bundle" in it
   when the structure is about a bundle already.

 - 4/4 is new, and implements the unbundling part, i.e. running
   either "git clone" or "git bundle unbundle" on the two files
   after a split bundle is made available locally.

Junio C Hamano (4):
  bundle doc: 'verify' is not about verifying the bundle
  bundle: plug resource leak
  bundle: keep a copy of bundle file name in the in-core bundle header
  bundle v3: the beginning

 Documentation/git-bundle.txt |   9 ++-
 builtin/bundle.c             |   6 +-
 bundle.c                     | 142 +++++++++++++++++++++++++++++++++++++------
 bundle.h                     |   8 ++-
 t/t5704-bundle.sh            |  64 +++++++++++++++++++
 transport.c                  |   4 +-
 6 files changed, 204 insertions(+), 29 deletions(-)

-- 
2.8.0-rc0-114-g0b3e5e5

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH v2 1/4] bundle doc: 'verify' is not about verifying the bundle
  2016-03-02 20:32       ` [PATCH v2 0/4] "split bundle" preview Junio C Hamano
@ 2016-03-02 20:32         ` Junio C Hamano
  2016-03-02 20:32         ` [PATCH v2 2/4] bundle: plug resource leak Junio C Hamano
                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 37+ messages in thread
From: Junio C Hamano @ 2016-03-02 20:32 UTC (permalink / raw)
  To: git

Even though the command does read the bundle header and checks to
see if it looks reasonable, the thin-pack data stream that follows
the header in the bundle file is not checked.

The documentation gives an incorrect impression that the data
contained in the bundle is validated, but the command is to validate
that the receiving repository is ready to accept the bundle, not to
check the validity of a bundle file itself.

Rephrase the paragraph to clarify this.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/git-bundle.txt | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Documentation/git-bundle.txt b/Documentation/git-bundle.txt
index 3a8120c..c0113a8 100644
--- a/Documentation/git-bundle.txt
+++ b/Documentation/git-bundle.txt
@@ -38,11 +38,10 @@ create <file>::
 	'git-rev-list-args' arguments to define the bundle contents.
 
 verify <file>::
-	Used to check that a bundle file is valid and will apply
-	cleanly to the current repository.  This includes checks on the
-	bundle format itself as well as checking that the prerequisite
-	commits exist and are fully linked in the current repository.
-	'git bundle' prints a list of missing commits, if any, and exits
+	Verifies that the given 'file' has a valid-looking bundle
+	header, and that your repository has all prerequisite
+	objects necessary to unpack the file as a bundle.  The
+	command prints a list of missing commits, if any, and exits
 	with a non-zero status.
 
 list-heads <file>::
-- 
2.8.0-rc0-114-g0b3e5e5

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH v2 2/4] bundle: plug resource leak
  2016-03-02 20:32       ` [PATCH v2 0/4] "split bundle" preview Junio C Hamano
  2016-03-02 20:32         ` [PATCH v2 1/4] bundle doc: 'verify' is not about verifying the bundle Junio C Hamano
@ 2016-03-02 20:32         ` Junio C Hamano
  2016-03-02 20:32         ` [PATCH v2 3/4] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
  2016-03-02 20:32         ` [PATCH v2 4/4] bundle v3: the beginning Junio C Hamano
  3 siblings, 0 replies; 37+ messages in thread
From: Junio C Hamano @ 2016-03-02 20:32 UTC (permalink / raw)
  To: git

The bundle header structure holds two lists of refs and object
names, which should be released when the user is done with it.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bundle.c    | 12 ++++++++++++
 bundle.h    |  1 +
 transport.c |  1 +
 3 files changed, 14 insertions(+)

diff --git a/bundle.c b/bundle.c
index 506ac49..9c5a6f0 100644
--- a/bundle.c
+++ b/bundle.c
@@ -102,6 +102,18 @@ int is_bundle(const char *path, int quiet)
 	return (fd >= 0);
 }
 
+void release_bundle_header(struct bundle_header *header)
+{
+	int i;
+
+	for (i = 0; i < header->prerequisites.nr; i++)
+		free(header->prerequisites.list[i].name);
+	free(header->prerequisites.list);
+	for (i = 0; i < header->references.nr; i++)
+		free(header->references.list[i].name);
+	free(header->references.list);
+}
+
 static int list_refs(struct ref_list *r, int argc, const char **argv)
 {
 	int i;
diff --git a/bundle.h b/bundle.h
index 1584e4d..f7ce23b 100644
--- a/bundle.h
+++ b/bundle.h
@@ -23,5 +23,6 @@ int verify_bundle(struct bundle_header *header, int verbose);
 int unbundle(struct bundle_header *header, int bundle_fd, int flags);
 int list_bundle_refs(struct bundle_header *header,
 		int argc, const char **argv);
+void release_bundle_header(struct bundle_header *);
 
 #endif
diff --git a/transport.c b/transport.c
index ca3cfa4..08e15c5 100644
--- a/transport.c
+++ b/transport.c
@@ -107,6 +107,7 @@ static int close_bundle(struct transport *transport)
 	struct bundle_transport_data *data = transport->data;
 	if (data->fd > 0)
 		close(data->fd);
+	release_bundle_header(&data->header);
 	free(data);
 	return 0;
 }
-- 
2.8.0-rc0-114-g0b3e5e5

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH v2 3/4] bundle: keep a copy of bundle file name in the in-core bundle header
  2016-03-02 20:32       ` [PATCH v2 0/4] "split bundle" preview Junio C Hamano
  2016-03-02 20:32         ` [PATCH v2 1/4] bundle doc: 'verify' is not about verifying the bundle Junio C Hamano
  2016-03-02 20:32         ` [PATCH v2 2/4] bundle: plug resource leak Junio C Hamano
@ 2016-03-02 20:32         ` Junio C Hamano
  2016-03-02 20:49           ` Jeff King
  2016-03-02 20:32         ` [PATCH v2 4/4] bundle v3: the beginning Junio C Hamano
  3 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2016-03-02 20:32 UTC (permalink / raw)
  To: git

This will be necessary when we start reading from a split bundle
where the header and the thin-pack data live in different files.

The in-core bundle header will read from a file that has the header,
and will record the path to that file.  We would find the name of
the file that hosts the thin-pack data from the header, and we would
take that name as relative to the file we read the header from.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/bundle.c |  6 +++---
 bundle.c         | 27 +++++++++++++++++----------
 bundle.h         |  4 +++-
 transport.c      |  3 ++-
 4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/builtin/bundle.c b/builtin/bundle.c
index 4883a43..c9ede65 100644
--- a/builtin/bundle.c
+++ b/builtin/bundle.c
@@ -35,9 +35,9 @@ int cmd_bundle(int argc, const char **argv, const char *prefix)
 		bundle_file = buffer;
 	}
 
-	memset(&header, 0, sizeof(header));
-	if (strcmp(cmd, "create") && (bundle_fd =
-				read_bundle_header(bundle_file, &header)) < 0)
+	init_bundle_header(&header, bundle_file);
+	if (strcmp(cmd, "create") &&
+	    (bundle_fd = read_bundle_header(&header)) < 0)
 		return 1;
 
 	if (!strcmp(cmd, "verify")) {
diff --git a/bundle.c b/bundle.c
index 9c5a6f0..32bdb01 100644
--- a/bundle.c
+++ b/bundle.c
@@ -21,8 +21,13 @@ static void add_to_ref_list(const unsigned char *sha1, const char *name,
 	list->nr++;
 }
 
-static int parse_bundle_header(int fd, struct bundle_header *header,
-			       const char *report_path)
+void init_bundle_header(struct bundle_header *header, const char *name)
+{
+	memset(header, '\0', sizeof(*header));
+	header->filename = xstrdup(name);
+}
+
+static int parse_bundle_header(int fd, struct bundle_header *header, int quiet)
 {
 	struct strbuf buf = STRBUF_INIT;
 	int status = 0;
@@ -30,9 +35,9 @@ static int parse_bundle_header(int fd, struct bundle_header *header,
 	/* The bundle header begins with the signature */
 	if (strbuf_getwholeline_fd(&buf, fd, '\n') ||
 	    strcmp(buf.buf, bundle_signature)) {
-		if (report_path)
+		if (!quiet)
 			error(_("'%s' does not look like a v2 bundle file"),
-			      report_path);
+			      header->filename);
 		status = -1;
 		goto abort;
 	}
@@ -57,7 +62,7 @@ static int parse_bundle_header(int fd, struct bundle_header *header,
 		if (get_sha1_hex(buf.buf, sha1) ||
 		    (buf.len > 40 && !isspace(buf.buf[40])) ||
 		    (!is_prereq && buf.len <= 40)) {
-			if (report_path)
+			if (!quiet)
 				error(_("unrecognized header: %s%s (%d)"),
 				      (is_prereq ? "-" : ""), buf.buf, (int)buf.len);
 			status = -1;
@@ -79,13 +84,13 @@ static int parse_bundle_header(int fd, struct bundle_header *header,
 	return fd;
 }
 
-int read_bundle_header(const char *path, struct bundle_header *header)
+int read_bundle_header(struct bundle_header *header)
 {
-	int fd = open(path, O_RDONLY);
+	int fd = open(header->filename, O_RDONLY);
 
 	if (fd < 0)
-		return error(_("could not open '%s'"), path);
-	return parse_bundle_header(fd, header, path);
+		return error(_("could not open '%s'"), header->filename);
+	return parse_bundle_header(fd, header, 0);
 }
 
 int is_bundle(const char *path, int quiet)
@@ -96,7 +101,7 @@ int is_bundle(const char *path, int quiet)
 	if (fd < 0)
 		return 0;
 	memset(&header, 0, sizeof(header));
-	fd = parse_bundle_header(fd, &header, quiet ? NULL : path);
+	fd = parse_bundle_header(fd, &header, quiet);
 	if (fd >= 0)
 		close(fd);
 	return (fd >= 0);
@@ -112,6 +117,8 @@ void release_bundle_header(struct bundle_header *header)
 	for (i = 0; i < header->references.nr; i++)
 		free(header->references.list[i].name);
 	free(header->references.list);
+
+	free((void *)header->filename);
 }
 
 static int list_refs(struct ref_list *r, int argc, const char **argv)
diff --git a/bundle.h b/bundle.h
index f7ce23b..e059ccf 100644
--- a/bundle.h
+++ b/bundle.h
@@ -10,12 +10,14 @@ struct ref_list {
 };
 
 struct bundle_header {
+	const char *filename;
 	struct ref_list prerequisites;
 	struct ref_list references;
 };
 
 int is_bundle(const char *path, int quiet);
-int read_bundle_header(const char *path, struct bundle_header *header);
+void init_bundle_header(struct bundle_header *, const char *filename);
+int read_bundle_header(struct bundle_header *header);
 int create_bundle(struct bundle_header *header, const char *path,
 		int argc, const char **argv);
 int verify_bundle(struct bundle_header *header, int verbose);
diff --git a/transport.c b/transport.c
index 08e15c5..7bd3206 100644
--- a/transport.c
+++ b/transport.c
@@ -81,7 +81,8 @@ static struct ref *get_refs_from_bundle(struct transport *transport, int for_pus
 
 	if (data->fd > 0)
 		close(data->fd);
-	data->fd = read_bundle_header(transport->url, &data->header);
+	init_bundle_header(&data->header, transport->url);
+	data->fd = read_bundle_header(&data->header);
 	if (data->fd < 0)
 		die ("Could not read bundle '%s'.", transport->url);
 	for (i = 0; i < data->header.references.nr; i++) {
-- 
2.8.0-rc0-114-g0b3e5e5

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH v2 4/4] bundle v3: the beginning
  2016-03-02 20:32       ` [PATCH v2 0/4] "split bundle" preview Junio C Hamano
                           ` (2 preceding siblings ...)
  2016-03-02 20:32         ` [PATCH v2 3/4] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
@ 2016-03-02 20:32         ` Junio C Hamano
  2016-03-03  1:36           ` Duy Nguyen
  2016-05-20 12:39           ` Christian Couder
  3 siblings, 2 replies; 37+ messages in thread
From: Junio C Hamano @ 2016-03-02 20:32 UTC (permalink / raw)
  To: git

The bundle v3 format introduces an ability to have the bundle header
(which describes what references in the bundled history can be
fetched, and what objects the receiving repository must have in
order to unbundle it successfully) in one file, and the bundled pack
stream data in a separate file.

A v3 bundle file begins with a line with "# v3 git bundle", followed
by zero or more "extended header" lines, and an empty line, finally
followed by the list of prerequisites and references in the same
format as v2 bundle.  If it uses the "split bundle" feature, there
is a "data: $URL" extended header line, and nothing follows the list
of prerequisites and references.  Also, "sha1: " extended header
line may exist to help validating that the pack stream data matches
the bundle header.

A typical expected use of a split bundle is to help initial clone
that involves a huge data transfer, and would go like this:

 - Any repository people would clone and fetch from would regularly
   be repacked, and it is expected that there would be a packfile
   without prerequisites that holds all (or at least most) of the
   history of it (call it pack-$name.pack).

 - After arranging that packfile to be downloadable over popular
   transfer methods used for serving static files (such as HTTP or
   HTTPS) that are easily resumable as $URL/pack-$name.pack, a v3
   bundle file (call it $name.bndl) can be prepared with an extended
   header "data: $URL/pack-$name.pack" to point at the download
   location for the packfile, and be served at "$URL/$name.bndl".

 - An updated Git client, when trying to "git clone" from such a
   repository, may be redirected to "$URL/$name.bndl", which would be
   a tiny text file (when split bundle feature is used).

 - The client would then inspect the downloaded $name.bndl, learn
   that the corresponding packfile exists at $URL/pack-$name.pack,
   and downloads it as pack-$name.pack, until the download succeeds.
   This can easily be done with "wget --continue" equivalent over an
   unreliable link.  The checksum recorded on the "sha1: " header
   line is expected to be used by this downloader (not written yet).

 - After fully downloading $name.bndl and pack-$name.pack and
   storing them next to each other, the client would clone from the
   $name.bndl; this would populate the newly created repository with
   reasonably recent history.

 - Then the client can issue "git fetch" against the original
   repository to obtain the most recent part of the history created
   since the bundle was made.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bundle.c          | 103 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 bundle.h          |   3 ++
 t/t5704-bundle.sh |  64 +++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 9 deletions(-)

diff --git a/bundle.c b/bundle.c
index 32bdb01..480630d 100644
--- a/bundle.c
+++ b/bundle.c
@@ -10,7 +10,8 @@
 #include "refs.h"
 #include "argv-array.h"
 
-static const char bundle_signature[] = "# v2 git bundle\n";
+static const char bundle_signature_v2[] = "# v2 git bundle\n";
+static const char bundle_signature_v3[] = "# v3 git bundle\n";
 
 static void add_to_ref_list(const unsigned char *sha1, const char *name,
 		struct ref_list *list)
@@ -33,16 +34,55 @@ static int parse_bundle_header(int fd, struct bundle_header *header, int quiet)
 	int status = 0;
 
 	/* The bundle header begins with the signature */
-	if (strbuf_getwholeline_fd(&buf, fd, '\n') ||
-	    strcmp(buf.buf, bundle_signature)) {
+	if (strbuf_getwholeline_fd(&buf, fd, '\n')) {
+	bad_bundle:
 		if (!quiet)
-			error(_("'%s' does not look like a v2 bundle file"),
+			error(_("'%s' does not look like a supported bundle file"),
 			      header->filename);
 		status = -1;
 		goto abort;
 	}
 
-	/* The bundle header ends with an empty line */
+	if (!strcmp(buf.buf, bundle_signature_v2))
+		header->bundle_version = 2;
+	else if (!strcmp(buf.buf, bundle_signature_v3))
+		header->bundle_version = 3;
+	else
+		goto bad_bundle;
+
+	if (header->bundle_version == 3) {
+		/*
+		 * bundle version v3 has extended headers before the
+		 * list of prerequisites and references.  The extended
+		 * headers end with an empty line.
+		 */
+		while (!strbuf_getwholeline_fd(&buf, fd, '\n')) {
+			const char *cp;
+			if (buf.len && buf.buf[buf.len - 1] == '\n')
+				buf.buf[--buf.len] = '\0';
+			if (!buf.len)
+				break;
+			if (skip_prefix(buf.buf, "data: ", &cp)) {
+				header->datafile = xstrdup(cp);
+				continue;
+			}
+			if (skip_prefix(buf.buf, "sha1: ", &cp)) {
+				unsigned char sha1[GIT_SHA1_RAWSZ];
+				if (get_sha1_hex(cp, sha1) ||
+				    cp[GIT_SHA1_HEXSZ])
+					goto bad_bundle;
+				hashcpy(header->csum, sha1);
+				continue;
+			}
+
+			goto bad_bundle;
+		}
+	}
+
+	/*
+	 * The bundle header lists prerequisites and
+	 * references, and the list ends with an empty line.
+	 */
 	while (!strbuf_getwholeline_fd(&buf, fd, '\n') &&
 	       buf.len && buf.buf[0] != '\n') {
 		unsigned char sha1[20];
@@ -77,7 +117,8 @@ static int parse_bundle_header(int fd, struct bundle_header *header, int quiet)
 
  abort:
 	if (status) {
-		close(fd);
+		if (0 <= fd)
+			close(fd);
 		fd = -1;
 	}
 	strbuf_release(&buf);
@@ -426,6 +467,7 @@ int create_bundle(struct bundle_header *header, const char *path,
 	int bundle_to_stdout;
 	int ref_count = 0;
 	struct rev_info revs;
+	const char *bundle_signature = bundle_signature_v2;
 
 	bundle_to_stdout = !strcmp(path, "-");
 	if (bundle_to_stdout)
@@ -480,22 +522,65 @@ int create_bundle(struct bundle_header *header, const char *path,
 	return 0;
 }
 
+/*
+ * v3 "split bundle" allows a separate packfile to be named
+ * as "data: $URL/$name_of_the_packfile".  This file is expected
+ * to be downloaded next to the bundle header file when the
+ * bundle is used.  Hence we find the path to the directory
+ * that contains the bundle header file, and append the basename
+ * part of the bundle_data_file to it, to form the name of the
+ * file that holds the pack data stream.
+ */
+static int open_bundle_data(struct bundle_header *header)
+{
+	const char *cp;
+	struct strbuf filename = STRBUF_INIT;
+	int fd;
+
+	assert(header->datafile);
+
+	cp = find_last_dir_sep(header->filename);
+	if (cp)
+		strbuf_add(&filename, header->filename,
+			   (cp - header->filename) + 1);
+	cp = find_last_dir_sep(header->datafile);
+	if (!cp)
+		cp = header->datafile;
+	strbuf_addstr(&filename, cp);
+
+	fd = open(filename.buf, O_RDONLY);
+	strbuf_release(&filename);
+	return fd;
+}
+
 int unbundle(struct bundle_header *header, int bundle_fd, int flags)
 {
 	const char *argv_index_pack[] = {"index-pack",
 					 "--fix-thin", "--stdin", NULL, NULL};
 	struct child_process ip = CHILD_PROCESS_INIT;
+	int status = 0, data_fd = -1;
 
 	if (flags & BUNDLE_VERBOSE)
 		argv_index_pack[3] = "-v";
 
 	if (verify_bundle(header, 0))
 		return -1;
+
+	if (header->datafile) {
+		data_fd = open_bundle_data(header);
+		if (data_fd < 0)
+			return error(_("bundle data not found"));
+		ip.in = data_fd;
+	} else {
+		ip.in = bundle_fd;
+	}
+
 	ip.argv = argv_index_pack;
-	ip.in = bundle_fd;
 	ip.no_stdout = 1;
 	ip.git_cmd = 1;
 	if (run_command(&ip))
-		return error(_("index-pack died"));
-	return 0;
+		status = error(_("index-pack died"));
+	if (0 <= data_fd)
+		close(data_fd);
+	return status;
 }
diff --git a/bundle.h b/bundle.h
index e059ccf..db55dc7 100644
--- a/bundle.h
+++ b/bundle.h
@@ -10,7 +10,10 @@ struct ref_list {
 };
 
 struct bundle_header {
+	int bundle_version;
 	const char *filename;
+	const char *datafile;
+	unsigned char csum[GIT_SHA1_RAWSZ];
 	struct ref_list prerequisites;
 	struct ref_list references;
 };
diff --git a/t/t5704-bundle.sh b/t/t5704-bundle.sh
index 348d9b3..e68523c 100755
--- a/t/t5704-bundle.sh
+++ b/t/t5704-bundle.sh
@@ -71,4 +71,68 @@ test_expect_success 'prerequisites with an empty commit message' '
 	git bundle verify bundle
 '
 
+# bundle v3 (experimental)
+test_expect_success 'clone from v3' '
+
+	# as "bundle create" does not exist yet for v3
+	# prepare it by hand here
+	head=$(git rev-parse HEAD) &&
+	name=$(echo $head | git pack-objects --revs v3) &&
+	test_when_finished "rm v3-$name.pack v3-$name.idx" &&
+	cat >v3.bndl <<-EOF &&
+	# v3 git bundle
+	data: v3-$name.pack
+
+	$head HEAD
+	$head refs/heads/master
+	EOF
+
+	git bundle verify v3.bndl &&
+	git bundle list-heads v3.bndl >actual &&
+	cat >expect <<-EOF &&
+	$head HEAD
+	$head refs/heads/master
+	EOF
+	test_cmp expect actual &&
+
+	git clone v3.bndl v3dst &&
+	git -C v3dst for-each-ref --format="%(objectname) %(refname)" >actual &&
+	cat >expect <<-EOF &&
+	$head refs/heads/master
+	$head refs/remotes/origin/HEAD
+	$head refs/remotes/origin/master
+	EOF
+	test_cmp expect actual &&
+	git -C v3dst fsck &&
+
+	# an "inline" v3 is still possible.
+	cat >v3i.bndl <<-EOF &&
+	# v3 git bundle
+
+	$head HEAD
+	$head refs/heads/master
+
+	EOF
+	cat v3-$name.pack >>v3i.bndl &&
+	test_when_finished "rm v3i.bndl" &&
+
+	git bundle verify v3i.bndl &&
+	git bundle list-heads v3i.bndl >actual &&
+	cat >expect <<-EOF &&
+	$head HEAD
+	$head refs/heads/master
+	EOF
+	test_cmp expect actual &&
+
+	git clone v3i.bndl v3idst &&
+	git -C v3idst for-each-ref --format="%(objectname) %(refname)" >actual &&
+	cat >expect <<-EOF &&
+	$head refs/heads/master
+	$head refs/remotes/origin/HEAD
+	$head refs/remotes/origin/master
+	EOF
+	test_cmp expect actual &&
+	git -C v3idst fsck
+'
+
 test_done
-- 
2.8.0-rc0-114-g0b3e5e5

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 3/4] bundle: keep a copy of bundle file name in the in-core bundle header
  2016-03-02 20:32         ` [PATCH v2 3/4] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
@ 2016-03-02 20:49           ` Jeff King
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff King @ 2016-03-02 20:49 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Wed, Mar 02, 2016 at 12:32:40PM -0800, Junio C Hamano wrote:

> +	free((void *)header->filename);

This cast is necessary, because...

> diff --git a/bundle.h b/bundle.h
> index f7ce23b..e059ccf 100644
> --- a/bundle.h
> +++ b/bundle.h
> @@ -10,12 +10,14 @@ struct ref_list {
>  };
>  
>  struct bundle_header {
> +	const char *filename;
>  	struct ref_list prerequisites;
>  	struct ref_list references;
>  };

...this is const, even though we know it is allocated on the heap.

I am OK if we want to keep it "conceptually const" so that anybody
looking at the struct knows they should not touch it. But I am also OK
with just making it "char *".

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-03-02 20:32         ` [PATCH v2 4/4] bundle v3: the beginning Junio C Hamano
@ 2016-03-03  1:36           ` Duy Nguyen
  2016-03-03  2:57             ` Junio C Hamano
  2016-05-20 12:39           ` Christian Couder
  1 sibling, 1 reply; 37+ messages in thread
From: Duy Nguyen @ 2016-03-03  1:36 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List

On Thu, Mar 3, 2016 at 3:32 AM, Junio C Hamano <gitster@pobox.com> wrote:
>  - After arranging that packfile to be downloadable over popular
>    transfer methods used for serving static files (such as HTTP or
>    HTTPS) that are easily resumable as $URL/pack-$name.pack, a v3
>    bundle file (call it $name.bndl) can be prepared with an extended
>    header "data: $URL/pack-$name.pack" to point at the download
>    location for the packfile, and be served at "$URL/$name.bndl".

Extra setup to offload things to CDN is great and all. But would it be
ok if we introduced a minimal resumable download service via
git-daemon to enable this feature with very little setup? Like
git-shell, you can only download certain packfiles for this use case
and nothing else with this service.
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-03-03  1:36           ` Duy Nguyen
@ 2016-03-03  2:57             ` Junio C Hamano
  2016-03-03  5:15               ` Duy Nguyen
  0 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2016-03-03  2:57 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

Duy Nguyen <pclouds@gmail.com> writes:

> would it be
> ok if we introduced a minimal resumable download service via
> git-daemon to enable this feature with very little setup? Like
> git-shell, you can only download certain packfiles for this use case
> and nothing else with this service.

I think it is a matter of priorities.

A minimalistic site that offers only git-daemon traffic without a
working HTTP server would certainly benefit from such a thing, but
serving static files efficiently over the web is commodity service
these days.  Wouldn't it be sufficient to just recommend having a
normal HTTP server serving static files, which should be "very
little setup" in today's world?

Such a "minimal resumable download service" over the git-daemon
transport still has to reinvent what is already done well by the
HTTP servers and clients (e.g. support of ETag equivalent to make
sure that the client can notice that the underlying data has changed
for a given resource, headers to communicate the total length,
making a range request and responding to it, etc. etc.).

In addition, by going the custom protocol route, you wouldn't
benefit from caching HTTP proxies available to the clients.

So I am not sure if the benefit outweighs the cost.

I wouldn't stop you if you really want to do it, but again, it is a
matter of priorities.  I personally feel that it would be a waste of
engineering talent, and it certainly would be a waste of review
bandwidth, if you gave priority to this over other more widely
useful parts of the system.  The procedure to repack should be
updated to produce such a base pack with the separate bundle header
on the server side, the protocol needs to be updated to allow
redirection for "clone" traffic, the logic to decide when to
redirect must be designed (e.g. "single branch" clone should not
choose a pack/bundle that represents the full repository, but a pack
for the branch that was asked), etc.  There are still tons of things
that need to be done, and it would be a distraction to invent a
custom download service nobody other than git-daemon talks before
all of the above is done.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-03-03  2:57             ` Junio C Hamano
@ 2016-03-03  5:15               ` Duy Nguyen
  0 siblings, 0 replies; 37+ messages in thread
From: Duy Nguyen @ 2016-03-03  5:15 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List

On Thu, Mar 3, 2016 at 9:57 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Duy Nguyen <pclouds@gmail.com> writes:
>
>> would it be
>> ok if we introduced a minimal resumable download service via
>> git-daemon to enable this feature with very little setup? Like
>> git-shell, you can only download certain packfiles for this use case
>> and nothing else with this service.
>
> I think it is a matter of priorities.
>
> A minimalistic site that offers only git-daemon traffic without a
> working HTTP server would certainly benefit from such a thing, but
> serving static files efficiently over the web is commodity service
> these days.  Wouldn't it be sufficient to just recommend having a
> normal HTTP server serving static files, which should be "very
> little setup" in today's world?
>
> Such a "minimal resumable download service" over the git-daemon
> transport still has to reinvent what is already done well by the
> HTTP servers and clients (e.g. support of ETag equivalent to make
> sure that the client can notice that the underlying data has changed
> for a given resource, headers to communicate the total length,
> making a range request and responding to it, etc. etc.).
>
> In addition, by going the custom protocol route, you wouldn't
> benefit from caching HTTP proxies available to the clients.
>
> So I am not sure if the benefit outweighs the cost.

What I had in mind was individuals who just want to publish their work
over git://. Right now it's just a matter of running git-daemon and
configuring it a bit. If it was me, I wouldn't expect all the bells
and whistles that come with http. But I agree that this is low
priority, "scratch your own itch" kind of thing. Let's have resumable
clone with standard download protocols first, then we'll see.
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-03-02 20:32         ` [PATCH v2 4/4] bundle v3: the beginning Junio C Hamano
  2016-03-03  1:36           ` Duy Nguyen
@ 2016-05-20 12:39           ` Christian Couder
  2016-05-31 12:43             ` Duy Nguyen
  2016-05-31 22:31             ` Jeff King
  1 sibling, 2 replies; 37+ messages in thread
From: Christian Couder @ 2016-05-20 12:39 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Jeff King

I am responding to this 2+ month old email because I am investigating
adding an alternate object store at the same level as loose and packed
objects. This alternate object store could be used for large files. I
am working on this for GitLab. (Yeah, I am working, as a freelance,
for both Booking.com and GitLab these days.)

On Wed, Mar 2, 2016 at 9:32 PM, Junio C Hamano <gitster@pobox.com> wrote:
> The bundle v3 format introduces an ability to have the bundle header
> (which describes what references in the bundled history can be
> fetched, and what objects the receiving repository must have in
> order to unbundle it successfully) in one file, and the bundled pack
> stream data in a separate file.
>
> A v3 bundle file begins with a line with "# v3 git bundle", followed
> by zero or more "extended header" lines, and an empty line, finally
> followed by the list of prerequisites and references in the same
> format as v2 bundle.  If it uses the "split bundle" feature, there
> is a "data: $URL" extended header line, and nothing follows the list
> of prerequisites and references.  Also, "sha1: " extended header
> line may exist to help validating that the pack stream data matches
> the bundle header.
>
> A typical expected use of a split bundle is to help initial clone
> that involves a huge data transfer, and would go like this:
>
>  - Any repository people would clone and fetch from would regularly
>    be repacked, and it is expected that there would be a packfile
>    without prerequisites that holds all (or at least most) of the
>    history of it (call it pack-$name.pack).
>
>  - After arranging that packfile to be downloadable over popular
>    transfer methods used for serving static files (such as HTTP or
>    HTTPS) that are easily resumable as $URL/pack-$name.pack, a v3
>    bundle file (call it $name.bndl) can be prepared with an extended
>    header "data: $URL/pack-$name.pack" to point at the download
>    location for the packfile, and be served at "$URL/$name.bndl".
>
>  - An updated Git client, when trying to "git clone" from such a
>    repository, may be redirected to "$URL/$name.bndl", which would be
>    a tiny text file (when split bundle feature is used).
>
>  - The client would then inspect the downloaded $name.bndl, learn
>    that the corresponding packfile exists at $URL/pack-$name.pack,
>    and downloads it as pack-$name.pack, until the download succeeds.
>    This can easily be done with "wget --continue" equivalent over an
>    unreliable link.  The checksum recorded on the "sha1: " header
>    line is expected to be used by this downloader (not written yet).

I wonder if this mechanism could also be used or extended to clone and
fetch an alternate object database.

In [1], [2] and [3], and this was also discussed during the
Contributor Summit last month, Peff says that he started working on
alternate object database support a long time ago, and that the hard
part is a protocol extension to tell remotes that you can access some
objects in a different way.

If a Git client would download a "$name.bndl" v3 bundle file that
would have a "data: $URL/alt-odb-$name.odb" extended header, the Git
client would just need to download "$URL/alt-odb-$name.odb" and use
the alternate object database support on this file.

This way it would know all it has to know to access the objects in the
alternate database. The alternate object database may not contain the
real objects, if they are too big for example, but just files that
describe how to get the real objects.

>  - After fully downloading $name.bndl and pack-$name.pack and
>    storing them next to each other, the client would clone from the
>    $name.bndl; this would populate the newly created repository with
>    reasonably recent history.
>
>  - Then the client can issue "git fetch" against the original
>    repository to obtain the most recent part of the history created
>    since the bundle was made.

[1] http://thread.gmane.org/gmane.comp.version-control.git/206886/focus=207040
[2] http://thread.gmane.org/gmane.comp.version-control.git/247171
[3] http://thread.gmane.org/gmane.comp.version-control.git/202902/focus=203020

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-05-20 12:39           ` Christian Couder
@ 2016-05-31 12:43             ` Duy Nguyen
  2016-05-31 13:18               ` Christian Couder
  2016-05-31 22:23               ` Jeff King
  2016-05-31 22:31             ` Jeff King
  1 sibling, 2 replies; 37+ messages in thread
From: Duy Nguyen @ 2016-05-31 12:43 UTC (permalink / raw)
  To: Christian Couder; +Cc: Junio C Hamano, git, Jeff King

On Fri, May 20, 2016 at 7:39 PM, Christian Couder
<christian.couder@gmail.com> wrote:
> I am responding to this 2+ month old email because I am investigating
> adding an alternate object store at the same level as loose and packed
> objects. This alternate object store could be used for large files. I
> am working on this for GitLab. (Yeah, I am working, as a freelance,
> for both Booking.com and GitLab these days.)

I'm also interested in this from a different angle, narrow clone that
potentially allows to skip download some large blobs (likely old ones
from the past that nobody will bother).

> On Wed, Mar 2, 2016 at 9:32 PM, Junio C Hamano <gitster@pobox.com> wrote:
>> The bundle v3 format introduces an ability to have the bundle header
>> (which describes what references in the bundled history can be
>> fetched, and what objects the receiving repository must have in
>> order to unbundle it successfully) in one file, and the bundled pack
>> stream data in a separate file.
>>
>> A v3 bundle file begins with a line with "# v3 git bundle", followed
>> by zero or more "extended header" lines, and an empty line, finally
>> followed by the list of prerequisites and references in the same
>> format as v2 bundle.  If it uses the "split bundle" feature, there
>> is a "data: $URL" extended header line, and nothing follows the list
>> of prerequisites and references.  Also, "sha1: " extended header
>> line may exist to help validating that the pack stream data matches
>> the bundle header.
>>
>> A typical expected use of a split bundle is to help initial clone
>> that involves a huge data transfer, and would go like this:
>>
>>  - Any repository people would clone and fetch from would regularly
>>    be repacked, and it is expected that there would be a packfile
>>    without prerequisites that holds all (or at least most) of the
>>    history of it (call it pack-$name.pack).
>>
>>  - After arranging that packfile to be downloadable over popular
>>    transfer methods used for serving static files (such as HTTP or
>>    HTTPS) that are easily resumable as $URL/pack-$name.pack, a v3
>>    bundle file (call it $name.bndl) can be prepared with an extended
>>    header "data: $URL/pack-$name.pack" to point at the download
>>    location for the packfile, and be served at "$URL/$name.bndl".
>>
>>  - An updated Git client, when trying to "git clone" from such a
>>    repository, may be redirected to "$URL/$name.bndl", which would be
>>    a tiny text file (when split bundle feature is used).
>>
>>  - The client would then inspect the downloaded $name.bndl, learn
>>    that the corresponding packfile exists at $URL/pack-$name.pack,
>>    and downloads it as pack-$name.pack, until the download succeeds.
>>    This can easily be done with "wget --continue" equivalent over an
>>    unreliable link.  The checksum recorded on the "sha1: " header
>>    line is expected to be used by this downloader (not written yet).
>
> I wonder if this mechanism could also be used or extended to clone and
> fetch an alternate object database.
>
> In [1], [2] and [3], and this was also discussed during the
> Contributor Summit last month, Peff says that he started working on
> alternate object database support a long time ago, and that the hard
> part is a protocol extension to tell remotes that you can access some
> objects in a different way.
>
> If a Git client would download a "$name.bndl" v3 bundle file that
> would have a "data: $URL/alt-odb-$name.odb" extended header, the Git
> client would just need to download "$URL/alt-odb-$name.odb" and use
> the alternate object database support on this file.

What does this file contain exactly? A list of SHA-1 that can be
retrieved from this remote/alternate odb? I wonder if we could just
git-replace for this marking. The replaced content could contain the
uri pointing to the alt odb. We could optionally contact alt odb to
retrieve real content, or just show the replaced/fake data when alt
odb is out of reach. Transferring git-replace is basically ref
exchange, which may be fine if you don't have a lot of objects in this
alt odb. If you do, well, we need to deal with lots of refs anyway.
This may benefit from it too.

> [3] http://thread.gmane.org/gmane.comp.version-control.git/202902/focus=203020

This points to  https://github.com/peff/git/commits/jk/external-odb
which is dead. Jeff, do you still have it somewhere, or is it not
worth looking at anymore?
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-05-31 12:43             ` Duy Nguyen
@ 2016-05-31 13:18               ` Christian Couder
  2016-06-01 13:37                 ` Duy Nguyen
  2016-06-01 14:00                 ` Duy Nguyen
  2016-05-31 22:23               ` Jeff King
  1 sibling, 2 replies; 37+ messages in thread
From: Christian Couder @ 2016-05-31 13:18 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, git, Jeff King

On Tue, May 31, 2016 at 2:43 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Fri, May 20, 2016 at 7:39 PM, Christian Couder
> <christian.couder@gmail.com> wrote:
>> I am responding to this 2+ month old email because I am investigating
>> adding an alternate object store at the same level as loose and packed
>> objects. This alternate object store could be used for large files. I
>> am working on this for GitLab. (Yeah, I am working, as a freelance,
>> for both Booking.com and GitLab these days.)
>
> I'm also interested in this from a different angle, narrow clone that
> potentially allows to skip download some large blobs (likely old ones
> from the past that nobody will bother).

Interesting!

[...]

>> I wonder if this mechanism could also be used or extended to clone and
>> fetch an alternate object database.
>>
>> In [1], [2] and [3], and this was also discussed during the
>> Contributor Summit last month, Peff says that he started working on
>> alternate object database support a long time ago, and that the hard
>> part is a protocol extension to tell remotes that you can access some
>> objects in a different way.
>>
>> If a Git client would download a "$name.bndl" v3 bundle file that
>> would have a "data: $URL/alt-odb-$name.odb" extended header, the Git
>> client would just need to download "$URL/alt-odb-$name.odb" and use
>> the alternate object database support on this file.
>
> What does this file contain exactly? A list of SHA-1 that can be
> retrieved from this remote/alternate odb?

It would depend on the external odb. Git could support different
external odb that have different trade-offs.

> I wonder if we could just
> git-replace for this marking. The replaced content could contain the
> uri pointing to the alt odb.

Yeah, interesting!
That's indeed another possibility that might not need the transfer of
any external odb.

But in this case it might be cleaner to just have a separate ref hierarchy like:

refs/external-odbs/my-ext-odb/<sha1>

instead of using the replace one.

Or maybe:

refs/replace/external-odbs/my-ext-odb/<sha1>

if we really want to use the replace hierarchy.

> We could optionally contact alt odb to
> retrieve real content, or just show the replaced/fake data when alt
> odb is out of reach.

Yeah, I wonder if that really needs the replace mechanism.

> Transferring git-replace is basically ref
> exchange, which may be fine if you don't have a lot of objects in this
> alt odb.

Yeah sure, great idea!

By the way this makes me wonder if we could implement resumable clone
using some kind of replace ref.

The client while cloning nearly as usual would download one or more
special replace refs that would points to objects with links to
download bundles using standard protocols.
Just after the clone, the client would read these objects and download
the bundles from these objects.
And then it would clone from these bundles.

> If you do, well, we need to deal with lots of refs anyway.
> This may benefit from it too.
>
>> [3] http://thread.gmane.org/gmane.comp.version-control.git/202902/focus=203020
>
> This points to  https://github.com/peff/git/commits/jk/external-odb
> which is dead. Jeff, do you still have it somewhere, or is it not
> worth looking at anymore?

I have rebased, fixed and improved it a bit. I added write support for
blobs. But the result is not very clean right now.
I was going to send a RFC patch series after cleaning the result, but
as you ask, here are some links to some branches:

- https://github.com/chriscool/git/commits/gl-external-odb3 (the
updated patches from Peff, plus 2 small patches from me)
- https://github.com/chriscool/git/commits/gl-external-odb7 (the same
as above, plus a number of WIP patches to add blob write support)

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-05-31 12:43             ` Duy Nguyen
  2016-05-31 13:18               ` Christian Couder
@ 2016-05-31 22:23               ` Jeff King
  1 sibling, 0 replies; 37+ messages in thread
From: Jeff King @ 2016-05-31 22:23 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Christian Couder, Junio C Hamano, git

On Tue, May 31, 2016 at 07:43:27PM +0700, Duy Nguyen wrote:

> > [3] http://thread.gmane.org/gmane.comp.version-control.git/202902/focus=203020
> 
> This points to  https://github.com/peff/git/commits/jk/external-odb
> which is dead. Jeff, do you still have it somewhere, or is it not
> worth looking at anymore?

It's now "jk/external-odb-wip" at the same repo. I wouldn't be surprised
if it doesn't even compile, though. I basically rebase my topics daily
against Junio's "master", so it may be carried forward, but things
marked "-wip" aren't part of my daily git build, and generally don't
even get compile-tested (usually if the rebase looks too hairy or awful,
I'll drop it completely, though, and I haven't done that here).

You're probably better off looking whatever Christian produces. :)

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-05-20 12:39           ` Christian Couder
  2016-05-31 12:43             ` Duy Nguyen
@ 2016-05-31 22:31             ` Jeff King
  2016-06-07 13:19               ` Christian Couder
  1 sibling, 1 reply; 37+ messages in thread
From: Jeff King @ 2016-05-31 22:31 UTC (permalink / raw)
  To: Christian Couder; +Cc: Junio C Hamano, git

On Fri, May 20, 2016 at 02:39:06PM +0200, Christian Couder wrote:

> I wonder if this mechanism could also be used or extended to clone and
> fetch an alternate object database.
> 
> In [1], [2] and [3], and this was also discussed during the
> Contributor Summit last month, Peff says that he started working on
> alternate object database support a long time ago, and that the hard
> part is a protocol extension to tell remotes that you can access some
> objects in a different way.
> 
> If a Git client would download a "$name.bndl" v3 bundle file that
> would have a "data: $URL/alt-odb-$name.odb" extended header, the Git
> client would just need to download "$URL/alt-odb-$name.odb" and use
> the alternate object database support on this file.
> 
> This way it would know all it has to know to access the objects in the
> alternate database. The alternate object database may not contain the
> real objects, if they are too big for example, but just files that
> describe how to get the real objects.

I'm not sure about this strategy. I see two complications:

  1. I don't think bundles need to be a part of this "external odb"
     strategy at all. If I understand correctly, I think you want to use
     it as a place to stuff metadata that the server tells the client,
     like "by the way, go here if you want another way to access some
     objects".

     But there are lots of cases where the server might want to tell
     the client that don't involve bundles at all.

  2. A server pointing the client to another object store is actually
     the least interesting bit of the protocol.

     The more interesting cases (to me) are:

       a. The receiving side of a connection (e.g., a fetch client)
          somehow has out-of-band access to some objects. How does it
	  tell the other side "do not bother sending me these objects; I
	  can get them in another way"?

       b. The receiving side of a connection has out-of-band access to
          some objects. Some of these will be expensive to get (e.g.,
	  requiring a large download), and some may be fast (e.g.,
	  they've already been fetched to a local cache). How do we tell
	  the sending side not to assume we have cheap access to these
	  objects (e.g., for use as a delta base)?

So I don't think you want to tie this into bundles due to (1), and I
think that bundles would be insufficient anyway because of (2).

Or maybe I'm misunderstanding what you propose.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-05-31 13:18               ` Christian Couder
@ 2016-06-01 13:37                 ` Duy Nguyen
  2016-06-07 14:49                   ` Christian Couder
  2016-06-01 14:00                 ` Duy Nguyen
  1 sibling, 1 reply; 37+ messages in thread
From: Duy Nguyen @ 2016-06-01 13:37 UTC (permalink / raw)
  To: Christian Couder; +Cc: Junio C Hamano, git, Jeff King

On Tue, May 31, 2016 at 8:18 PM, Christian Couder
<christian.couder@gmail.com> wrote:
>>> I wonder if this mechanism could also be used or extended to clone and
>>> fetch an alternate object database.
>>>
>>> In [1], [2] and [3], and this was also discussed during the
>>> Contributor Summit last month, Peff says that he started working on
>>> alternate object database support a long time ago, and that the hard
>>> part is a protocol extension to tell remotes that you can access some
>>> objects in a different way.
>>>
>>> If a Git client would download a "$name.bndl" v3 bundle file that
>>> would have a "data: $URL/alt-odb-$name.odb" extended header, the Git
>>> client would just need to download "$URL/alt-odb-$name.odb" and use
>>> the alternate object database support on this file.
>>
>> What does this file contain exactly? A list of SHA-1 that can be
>> retrieved from this remote/alternate odb?
>
> It would depend on the external odb. Git could support different
> external odb that have different trade-offs.
>
>> I wonder if we could just
>> git-replace for this marking. The replaced content could contain the
>> uri pointing to the alt odb.
>
> Yeah, interesting!
> That's indeed another possibility that might not need the transfer of
> any external odb.
>
> But in this case it might be cleaner to just have a separate ref hierarchy like:
>
> refs/external-odbs/my-ext-odb/<sha1>
>
> instead of using the replace one.
>
> Or maybe:
>
> refs/replace/external-odbs/my-ext-odb/<sha1>
>
> if we really want to use the replace hierarchy.

Yep. replace hierarchy crossed my mind. But then I thought about
performance degradation when there are more than one pack (we have to
search through them all for every SHA-1) and discarded it because we
would need to do the same linear search here. I guess we will most
likely have one or two name spaces so it probably won't matter.

>> We could optionally contact alt odb to
>> retrieve real content, or just show the replaced/fake data when alt
>> odb is out of reach.
>
> Yeah, I wonder if that really needs the replace mechanism.

Replace mechanism provides good hook point. But it really depends how
invasive this remote odb is. If a fake content is enough to avoid
breakages up high, git-replace is enough. If you really need to pass
remote odb info up so higher levels can do something more fancy, then
it's insufficient.

> By the way this makes me wonder if we could implement resumable clone
> using some kind of replace ref.
>
> The client while cloning nearly as usual would download one or more
> special replace refs that would points to objects with links to
> download bundles using standard protocols.
> Just after the clone, the client would read these objects and download
> the bundles from these objects.
> And then it would clone from these bundles.

I thought we have settled on resumable clone, just waiting for an
implementation :) Doing it your way, you would need to download these
special objects too (in a pack?) and come back download some more
bundles. It would be more efficient to show the bundle uri early and
go download the bundle on the side while you go on to get the
addition/smaller pack that contains the rest.
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-05-31 13:18               ` Christian Couder
  2016-06-01 13:37                 ` Duy Nguyen
@ 2016-06-01 14:00                 ` Duy Nguyen
  2016-06-07  8:46                   ` Christian Couder
  1 sibling, 1 reply; 37+ messages in thread
From: Duy Nguyen @ 2016-06-01 14:00 UTC (permalink / raw)
  To: Christian Couder; +Cc: Junio C Hamano, git, Jeff King

On Tue, May 31, 2016 at 8:18 PM, Christian Couder
<christian.couder@gmail.com> wrote:
>>> [3] http://thread.gmane.org/gmane.comp.version-control.git/202902/focus=203020
>>
>> This points to  https://github.com/peff/git/commits/jk/external-odb
>> which is dead. Jeff, do you still have it somewhere, or is it not
>> worth looking at anymore?
>
> I have rebased, fixed and improved it a bit. I added write support for
> blobs. But the result is not very clean right now.
> I was going to send a RFC patch series after cleaning the result, but
> as you ask, here are some links to some branches:
>
> - https://github.com/chriscool/git/commits/gl-external-odb3 (the
> updated patches from Peff, plus 2 small patches from me)
> - https://github.com/chriscool/git/commits/gl-external-odb7 (the same
> as above, plus a number of WIP patches to add blob write support)

Thanks. I had a super quick look. It would be nice if you could give a
high level overview on this (if you're going to spend a lot more time on it).

One random thought, maybe it's better to have a daemon for external
odb right from the start (one for all odbs, or one per odb, I don't
know). It could do fancy stuff like object caching if necessary, and
it can avoid high cost handshake (e.g. via tls) every time a git
process runs and gets one object. Reducing process spawn would
definitely receive a big cheer from Windows crowd.

Any thought on object streaming support? It could be a big deal (might
affect some design decisions). I would also think about how pack v4
fits in this (e.g. how a tree walker can still walk fast, a big
promise of pack v4; I suppose if you still maintain "pack" concept
over external odb then it might work). Not that it really matters.
Pack v4 is the future, but the future can never be "today" :)
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-01 14:00                 ` Duy Nguyen
@ 2016-06-07  8:46                   ` Christian Couder
  2016-06-07  8:53                     ` Mike Hommey
                                       ` (2 more replies)
  0 siblings, 3 replies; 37+ messages in thread
From: Christian Couder @ 2016-06-07  8:46 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, git, Jeff King

On Wed, Jun 1, 2016 at 4:00 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Tue, May 31, 2016 at 8:18 PM, Christian Couder
> <christian.couder@gmail.com> wrote:
>>>> [3] http://thread.gmane.org/gmane.comp.version-control.git/202902/focus=203020
>>>
>>> This points to  https://github.com/peff/git/commits/jk/external-odb
>>> which is dead. Jeff, do you still have it somewhere, or is it not
>>> worth looking at anymore?
>>
>> I have rebased, fixed and improved it a bit. I added write support for
>> blobs. But the result is not very clean right now.
>> I was going to send a RFC patch series after cleaning the result, but
>> as you ask, here are some links to some branches:
>>
>> - https://github.com/chriscool/git/commits/gl-external-odb3 (the
>> updated patches from Peff, plus 2 small patches from me)
>> - https://github.com/chriscool/git/commits/gl-external-odb7 (the same
>> as above, plus a number of WIP patches to add blob write support)
>
> Thanks. I had a super quick look. It would be nice if you could give a
> high level overview on this (if you're going to spend a lot more time on it).

Sorry about the late answer.

Here is a new series after some cleanup:

https://github.com/chriscool/git/commits/gl-external-odb12

The high level overview of the patch series I would like to send
really soon now could go like this:

---
Git can store its objects only in the form of loose objects in
separate files or packed objects in a pack file.
To be able to better handle some kind of objects, for example big
blobs, it would be nice if Git could store its objects in other object
databases (ODB).

To do that, this patch series makes it possible to register commands,
using "odb.<odbname>.command" config variables, to access external
ODBs. Each specified command will then be called the following ways:

  - "<command> have": the command should output the sha1, size and
type of all the objects the external ODB contains, one object per
line.
  - "<command> get <sha1>": the command should then read from the
external ODB the content of the object corresponding to <sha1> and
output it on stdout.
  - "<command> put <sha1> <size> <type>": the command should then read
from stdin an object and store it in the external ODB.

This RFC patch series does not address the following important parts
of a complete solution:

  - There is no way to transfer external ODB content using Git.
  - No real external ODB has been interfaced with Git. The tests use
another git repo in a separate directory for this purpose which is
probably useless in the real world.
---

> One random thought, maybe it's better to have a daemon for external
> odb right from the start (one for all odbs, or one per odb, I don't
> know). It could do fancy stuff like object caching if necessary, and
> it can avoid high cost handshake (e.g. via tls) every time a git
> process runs and gets one object. Reducing process spawn would
> definitely receive a big cheer from Windows crowd.

The caching could be done inside Git and I am not sure it's worth
optimizing this now.
It could also make it more difficult to write support for an external
ODB if we required a daemon.
Maybe later we can add support for "odb.<odbname>.daemon" if we think
that this is worth it.

> Any thought on object streaming support?

No I didn't think about this. In fact I am not sure what this means.

> It could be a big deal (might
> affect some design decisions).

Could you elaborate on this?

> I would also think about how pack v4
> fits in this (e.g. how a tree walker can still walk fast, a big
> promise of pack v4; I suppose if you still maintain "pack" concept
> over external odb then it might work). Not that it really matters.
> Pack v4 is the future, but the future can never be "today" :)

Sorry I haven't really followed pack v4 and I forgot what it is about.

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-07  8:46                   ` Christian Couder
@ 2016-06-07  8:53                     ` Mike Hommey
  2016-06-07 10:22                     ` Duy Nguyen
  2016-06-07 19:23                     ` Junio C Hamano
  2 siblings, 0 replies; 37+ messages in thread
From: Mike Hommey @ 2016-06-07  8:53 UTC (permalink / raw)
  To: Christian Couder; +Cc: Duy Nguyen, Junio C Hamano, git, Jeff King

On Tue, Jun 07, 2016 at 10:46:07AM +0200, Christian Couder wrote:
> The high level overview of the patch series I would like to send
> really soon now could go like this:
> 
> ---
> Git can store its objects only in the form of loose objects in
> separate files or packed objects in a pack file.
> To be able to better handle some kind of objects, for example big
> blobs, it would be nice if Git could store its objects in other object
> databases (ODB).
> 
> To do that, this patch series makes it possible to register commands,
> using "odb.<odbname>.command" config variables, to access external
> ODBs. Each specified command will then be called the following ways:
> 
>   - "<command> have": the command should output the sha1, size and
> type of all the objects the external ODB contains, one object per
> line.
>   - "<command> get <sha1>": the command should then read from the
> external ODB the content of the object corresponding to <sha1> and
> output it on stdout.
>   - "<command> put <sha1> <size> <type>": the command should then read
> from stdin an object and store it in the external ODB.

(disclaimer: I didn't look at the patch series)

Does this mean you're going to fork/exec() a new <command> for each of
these? It would probably be better if it was "batched", where the
executable is invoked once and the commands are passed to its stdin.

Mike

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-07  8:46                   ` Christian Couder
  2016-06-07  8:53                     ` Mike Hommey
@ 2016-06-07 10:22                     ` Duy Nguyen
  2016-06-07 19:23                     ` Junio C Hamano
  2 siblings, 0 replies; 37+ messages in thread
From: Duy Nguyen @ 2016-06-07 10:22 UTC (permalink / raw)
  To: Christian Couder; +Cc: Junio C Hamano, git, Jeff King

On Tue, Jun 7, 2016 at 3:46 PM, Christian Couder
<christian.couder@gmail.com> wrote:
>> Any thought on object streaming support?
>
> No I didn't think about this. In fact I am not sure what this means.
>
>> It could be a big deal (might
>> affect some design decisions).
>
> Could you elaborate on this?

Object streaming api is in streaming.h. Normally objects are small and
we can inflate the whole thing in memory before doing anything with
them. For really large objects (which I guess is one of the reasons
for remote odb) we don't want to do that. It takes lots of memory and
you could have objects larger than your physical memory. In some cases
we can ignore those objects (e.g. mark them binary and choose not to
diff). In some other cases (e.g. checkout), we use streaming interface
to process an object while we're inflating it to keep memory usage
down. It's easy to add a new streaming backend, once you settle on how
remote odb streams stuff.

>> I would also think about how pack v4
>> fits in this (e.g. how a tree walker can still walk fast, a big
>> promise of pack v4; I suppose if you still maintain "pack" concept
>> over external odb then it might work). Not that it really matters.
>> Pack v4 is the future, but the future can never be "today" :)
>
> Sorry I haven't really followed pack v4 and I forgot what it is about.

It's a new pack format (and practically vaporware at this point) that
promises much faster access when you need to walk through trees and
commits (think rev-list --objects --all, or git-blame). Because we are
(or I am) still not sure if pack v4 will ever get to the state where
it can be merged to git.git, I think it's ok for you to ignore it too
if you want. You can read more about the format here [1] and go even
further back to [2] when Nicolas teased us with the pack size
(smaller, which is a nice side effect). The potential issue with pack
v4 is, the tree walker (struct tree_desc and related funcs in
tree-walk.h) needs to know about pack v4 in order to walk fast.
Current tree walker does not care if an object is packed (using what
format) at all. Remote odb for pack v4 must have some way that allows
to read pack data directly, something close to "mmap", it's not just
about an api to "get me the canonical content of this object".

[1] http://article.gmane.org/gmane.comp.version-control.git/234012
[2] http://article.gmane.org/gmane.comp.version-control.git/233038
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-05-31 22:31             ` Jeff King
@ 2016-06-07 13:19               ` Christian Couder
  2016-06-07 20:35                 ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Christian Couder @ 2016-06-07 13:19 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, git

On Wed, Jun 1, 2016 at 12:31 AM, Jeff King <peff@peff.net> wrote:
> On Fri, May 20, 2016 at 02:39:06PM +0200, Christian Couder wrote:
>
>> I wonder if this mechanism could also be used or extended to clone and
>> fetch an alternate object database.
>>
>> In [1], [2] and [3], and this was also discussed during the
>> Contributor Summit last month, Peff says that he started working on
>> alternate object database support a long time ago, and that the hard
>> part is a protocol extension to tell remotes that you can access some
>> objects in a different way.
>>
>> If a Git client would download a "$name.bndl" v3 bundle file that
>> would have a "data: $URL/alt-odb-$name.odb" extended header, the Git
>> client would just need to download "$URL/alt-odb-$name.odb" and use
>> the alternate object database support on this file.
>>
>> This way it would know all it has to know to access the objects in the
>> alternate database. The alternate object database may not contain the
>> real objects, if they are too big for example, but just files that
>> describe how to get the real objects.
>
> I'm not sure about this strategy.

I am also not sure that this is the best strategy, but I think it's
worth discussing.

> I see two complications:
>
>   1. I don't think bundles need to be a part of this "external odb"
>      strategy at all. If I understand correctly, I think you want to use
>      it as a place to stuff metadata that the server tells the client,
>      like "by the way, go here if you want another way to access some
>      objects".

Yeah, basically I think it might be possible to use the bundle
mechanism to transfer what an external ODB on the client would need to
be initialized or updated.

>      But there are lots of cases where the server might want to tell
>      the client that don't involve bundles at all.

The idea is also that anytime the server needs to send external ODB
data to the client, it would ask its own external ODB to prepare a
kind of bundle with that data and use the bundle v3 mechanism to send
it.
That may need the bundle v3 mechanism to be extended, but I don't see
in which cases it would not work.

>   2. A server pointing the client to another object store is actually
>      the least interesting bit of the protocol.
>
>      The more interesting cases (to me) are:
>
>        a. The receiving side of a connection (e.g., a fetch client)
>           somehow has out-of-band access to some objects. How does it
>           tell the other side "do not bother sending me these objects; I
>           can get them in another way"?

I don't see a difference with regular objects that the fetch client
already has. If it already has some regular objects, a way to tell the
server "don't bother sending me these objects" is useful already and
it should be possible to use it to tell the server that there is no
need to send some objects stored in the external ODB too.

Also something like this is needed for shallow clones and narrow clones anyway.

>        b. The receiving side of a connection has out-of-band access to
>           some objects. Some of these will be expensive to get (e.g.,
>           requiring a large download), and some may be fast (e.g.,
>           they've already been fetched to a local cache). How do we tell
>           the sending side not to assume we have cheap access to these
>           objects (e.g., for use as a delta base)?

I don't think we need to tell the sending side we have cheap access or
not to some objects.
If the objects are managed by the external ODB, it's the external ODB
on the server and on the client that will manage these objects. They
should not be used as delta bases.
Perhaps there is no mechanism to say that some objects (basically all
external ODB managed objects) should not be used as delta bases, but
that could be added.

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-01 13:37                 ` Duy Nguyen
@ 2016-06-07 14:49                   ` Christian Couder
  0 siblings, 0 replies; 37+ messages in thread
From: Christian Couder @ 2016-06-07 14:49 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, git, Jeff King

On Wed, Jun 1, 2016 at 3:37 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Tue, May 31, 2016 at 8:18 PM, Christian Couder
> <christian.couder@gmail.com> wrote:
>>>> I wonder if this mechanism could also be used or extended to clone and
>>>> fetch an alternate object database.
>>>>
>>>> In [1], [2] and [3], and this was also discussed during the
>>>> Contributor Summit last month, Peff says that he started working on
>>>> alternate object database support a long time ago, and that the hard
>>>> part is a protocol extension to tell remotes that you can access some
>>>> objects in a different way.
>>>>
>>>> If a Git client would download a "$name.bndl" v3 bundle file that
>>>> would have a "data: $URL/alt-odb-$name.odb" extended header, the Git
>>>> client would just need to download "$URL/alt-odb-$name.odb" and use
>>>> the alternate object database support on this file.
>>>
>>> What does this file contain exactly? A list of SHA-1 that can be
>>> retrieved from this remote/alternate odb?
>>
>> It would depend on the external odb. Git could support different
>> external odb that have different trade-offs.
>>
>>> I wonder if we could just
>>> git-replace for this marking. The replaced content could contain the
>>> uri pointing to the alt odb.
>>
>> Yeah, interesting!
>> That's indeed another possibility that might not need the transfer of
>> any external odb.
>>
>> But in this case it might be cleaner to just have a separate ref hierarchy like:
>>
>> refs/external-odbs/my-ext-odb/<sha1>
>>
>> instead of using the replace one.
>>
>> Or maybe:
>>
>> refs/replace/external-odbs/my-ext-odb/<sha1>
>>
>> if we really want to use the replace hierarchy.
>
> Yep. replace hierarchy crossed my mind. But then I thought about
> performance degradation when there are more than one pack (we have to
> search through them all for every SHA-1) and discarded it because we
> would need to do the same linear search here. I guess we will most
> likely have one or two name spaces so it probably won't matter.

Yeah.

>>> We could optionally contact alt odb to
>>> retrieve real content, or just show the replaced/fake data when alt
>>> odb is out of reach.
>>
>> Yeah, I wonder if that really needs the replace mechanism.
>
> Replace mechanism provides good hook point. But it really depends how
> invasive this remote odb is. If a fake content is enough to avoid
> breakages up high, git-replace is enough. If you really need to pass
> remote odb info up so higher levels can do something more fancy, then
> it's insufficient.
>
>> By the way this makes me wonder if we could implement resumable clone
>> using some kind of replace ref.
>>
>> The client while cloning nearly as usual would download one or more
>> special replace refs that would points to objects with links to
>> download bundles using standard protocols.
>> Just after the clone, the client would read these objects and download
>> the bundles from these objects.
>> And then it would clone from these bundles.
>
> I thought we have settled on resumable clone, just waiting for an
> implementation :) Doing it your way, you would need to download these
> special objects too (in a pack?) and come back download some more
> bundles. It would be more efficient to show the bundle uri early and
> go download the bundle on the side while you go on to get the
> addition/smaller pack that contains the rest.

Yeah, something like the bundle v3 mechanism is probably more efficient.

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-07  8:46                   ` Christian Couder
  2016-06-07  8:53                     ` Mike Hommey
  2016-06-07 10:22                     ` Duy Nguyen
@ 2016-06-07 19:23                     ` Junio C Hamano
  2016-06-07 20:23                       ` Jeff King
  2 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2016-06-07 19:23 UTC (permalink / raw)
  To: Christian Couder; +Cc: Duy Nguyen, git, Jeff King

Christian Couder <christian.couder@gmail.com> writes:

> Git can store its objects only in the form of loose objects in
> separate files or packed objects in a pack file.
> To be able to better handle some kind of objects, for example big
> blobs, it would be nice if Git could store its objects in other object
> databases (ODB).
>
> To do that, this patch series makes it possible to register commands,
> using "odb.<odbname>.command" config variables, to access external
> ODBs. Each specified command will then be called the following ways:

Hopefully it is done via a cheap RPC instead of forking/execing the
command for each and every object lookup.

>   - "<command> have": the command should output the sha1, size and
> type of all the objects the external ODB contains, one object per
> line.

Why size and type at this point is needed by the clients?  That is
more expensive to compute than just a bare list of object names.

>   - "<command> get <sha1>": the command should then read from the
> external ODB the content of the object corresponding to <sha1> and
> output it on stdout.

The type and size should be given at this point.

>   - "<command> put <sha1> <size> <type>": the command should then read
> from stdin an object and store it in the external ODB.

Is ODB required to sanity check that <sha1> matches what the data
hashes down to?

If this thing is primarily to offload large blobs, you might also
want not "get" but "checkout <sha1> <path>" to bypass Git entirely,
but I haven't thought it through.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-07 19:23                     ` Junio C Hamano
@ 2016-06-07 20:23                       ` Jeff King
  2016-06-08 10:44                         ` Duy Nguyen
  2016-06-08 18:05                         ` Junio C Hamano
  0 siblings, 2 replies; 37+ messages in thread
From: Jeff King @ 2016-06-07 20:23 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Christian Couder, Duy Nguyen, git

On Tue, Jun 07, 2016 at 12:23:40PM -0700, Junio C Hamano wrote:

> Christian Couder <christian.couder@gmail.com> writes:
> 
> > Git can store its objects only in the form of loose objects in
> > separate files or packed objects in a pack file.
> > To be able to better handle some kind of objects, for example big
> > blobs, it would be nice if Git could store its objects in other object
> > databases (ODB).
> >
> > To do that, this patch series makes it possible to register commands,
> > using "odb.<odbname>.command" config variables, to access external
> > ODBs. Each specified command will then be called the following ways:
> 
> Hopefully it is done via a cheap RPC instead of forking/execing the
> command for each and every object lookup.

This interface comes from my earlier patches, so I'll try to shed a
little light on the decisions I made there.

Because this "external odb" essentially acts as a git alternate, we
would hit it only when we couldn't find an object through regular means.
Git would then make the object available in the usual on-disk format
(probably as a loose object).

So in most processes, we would not need to consult the odb command at
all. And when we do, the first thing would be to get its "have" list,
which would at most run once per process.

So the per-object cost is really calling "get", and my assumption there
was that the cost of actually retrieving the object over the network
would dwarf the fork/exec cost.

I also waffled on having git cache the output of "<command> have" in
some fast-lookup format to save even the single fork/exec. But I figured
that was something that could be added later if needed.

You'll note that this is sort of a "fault-in" model. Another model would
be to treat external odb updates similar to fetches. I.e., we touch the
network only during a special update operation, and then try to work
locally with whatever the external odb has. IMHO this policy could
actually be up to the external odb itself (i.e., its "have" command
could serve from a local cache if it likes).

> >   - "<command> have": the command should output the sha1, size and
> > type of all the objects the external ODB contains, one object per
> > line.
> 
> Why size and type at this point is needed by the clients?  That is
> more expensive to compute than just a bare list of object names.

Yes, but it lets us avoid doing a lot of "get" operations. For example,
in a regular diff without binary-diffs enabled, we can automatically
determine that a diff will be considered binary based purely on the size
of the objects (related to core.bigfilethreshold). So if we know the
sizes, we can run "git log -p" without faulting-in each of the objects
just to say "woah, that looks binary".

One can accomplish this with .gitattributes, too, of course, but the
size thing just works out of the box.

There are other places where it will come in handy, too. E.g., fscking a
tree object you have, you want to make sure that the object referred to
with mode 100644 is actually a blob.

I also don't think the cost to compute size and type on the server is
all that important. Yes, if you're backing your external odb with a git
repository that runs "git cat-file" on the fly, it is more expensive.
But in practice, I'd expect the server side to create a static manifest
and serve it over HTTP (this also gives the benefit of things like
ETags).

> >   - "<command> get <sha1>": the command should then read from the
> > external ODB the content of the object corresponding to <sha1> and
> > output it on stdout.
> 
> The type and size should be given at this point.

I don't think there's a reason not to; I didn't here because it would be
redundant with what Git already knows from the "have" manifest it
receives above.

> >   - "<command> put <sha1> <size> <type>": the command should then read
> > from stdin an object and store it in the external ODB.
> 
> Is ODB required to sanity check that <sha1> matches what the data
> hashes down to?

I think that would be up to the ODB, but it does seem like a good idea.

Likewise, I'm not sure if "get" should be allowed to return contents
that don't match the sha1. That would be fine for things like "diff",
but would probably make "fsck" unhappy.

> If this thing is primarily to offload large blobs, you might also
> want not "get" but "checkout <sha1> <path>" to bypass Git entirely,
> but I haven't thought it through.

My mental model is that the external odb gets the object into the local
odb, and then you can use the regular streaming-checkout code path. And
the local odb serves as your cache.

That does mean you might have two copies of each object (one in the odb,
and one in the working tree), as opposed to a true cacheless system,
which can get away with one.

I think you could do that cacheless thing with the interface here,
though; the "get" operation can stream, and you can stream directly to
the working tree.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-07 13:19               ` Christian Couder
@ 2016-06-07 20:35                 ` Jeff King
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff King @ 2016-06-07 20:35 UTC (permalink / raw)
  To: Christian Couder; +Cc: Junio C Hamano, git

On Tue, Jun 07, 2016 at 03:19:46PM +0200, Christian Couder wrote:

> >      But there are lots of cases where the server might want to tell
> >      the client that don't involve bundles at all.
> 
> The idea is also that anytime the server needs to send external ODB
> data to the client, it would ask its own external ODB to prepare a
> kind of bundle with that data and use the bundle v3 mechanism to send
> it.
> That may need the bundle v3 mechanism to be extended, but I don't see
> in which cases it would not work.

Ah, I see we do not have the same underlying mental model.

I think the external odb is purely the _client's_ business. The server
does not have to have an external odb at all, and does not need to know
about the client's. The client is responsible for telling the server
during the git protocol anything it would need to know (like "do not
bother sending objects over 50MB; I can get them elsewhere").

This makes the problem much more complicated, but it is more flexible
and decentralized.

> >        a. The receiving side of a connection (e.g., a fetch client)
> >           somehow has out-of-band access to some objects. How does it
> >           tell the other side "do not bother sending me these objects; I
> >           can get them in another way"?
> 
> I don't see a difference with regular objects that the fetch client
> already has. If it already has some regular objects, a way to tell the
> server "don't bother sending me these objects" is useful already and
> it should be possible to use it to tell the server that there is no
> need to send some objects stored in the external ODB too.

The way to do that with normal objects is by finding shared commit tips,
and assuming the normal git repository property of "if you have X, you
have all of the objects reachable from X".

This whole idea is essentially creating "holes" in that property. You
can enumerate all of the holes, but I am not sure that scales well. We
get a lot of efficiency by communicating only ref tips during the
negotiation, and not individual object names.

> Also something like this is needed for shallow clones and narrow
> clones anyway.

Yes, and I don't think it scales well there, either. A single shallow
cutoff works OK. But if you repeatedly shallow-fetch into a repository,
you end up with a patchwork of disconnected "islands" of history. The
CPU required on the server side to serve those fetch requests is much
greater than what would normally be needed. You can't use things like
reachability bitmaps, and you have to open up the trees for each island
to see which objects the other side actually has.

> >        b. The receiving side of a connection has out-of-band access to
> >           some objects. Some of these will be expensive to get (e.g.,
> >           requiring a large download), and some may be fast (e.g.,
> >           they've already been fetched to a local cache). How do we tell
> >           the sending side not to assume we have cheap access to these
> >           objects (e.g., for use as a delta base)?
> 
> I don't think we need to tell the sending side we have cheap access or
> not to some objects.
> If the objects are managed by the external ODB, it's the external ODB
> on the server and on the client that will manage these objects. They
> should not be used as delta bases.
> Perhaps there is no mechanism to say that some objects (basically all
> external ODB managed objects) should not be used as delta bases, but
> that could be added.

Yes, I agree that _if_ the server can access the list of objects
available in the external odb, this becomes much easier. I'm just not
convinced that level of coupling is a good idea.

Note that the server would also want to take this into account during
repacking, as otherwise you end up with fetches that are very expensive
to serve (you want to send X which is a delta based on Y, but you know
that Y is available via the external odb, and therefore should not be
used as a base. So you have to throw out the delta for X and either send
it whole or compute a new one. That's much more expensive than blitting
the delta from disk, which is what a normal clone would do).

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-07 20:23                       ` Jeff King
@ 2016-06-08 10:44                         ` Duy Nguyen
  2016-06-08 16:19                           ` Jeff King
  2016-06-08 18:05                         ` Junio C Hamano
  1 sibling, 1 reply; 37+ messages in thread
From: Duy Nguyen @ 2016-06-08 10:44 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Christian Couder, git

On Wed, Jun 8, 2016 at 3:23 AM, Jeff King <peff@peff.net> wrote:
> Because this "external odb" essentially acts as a git alternate, we
> would hit it only when we couldn't find an object through regular means.
> Git would then make the object available in the usual on-disk format
> (probably as a loose object).

This means git-gc (and all things that do rev-list --objects --all)
would download at least all trees and commits? Or will we have special
treatment for those commands?
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-08 10:44                         ` Duy Nguyen
@ 2016-06-08 16:19                           ` Jeff King
  2016-06-09  8:53                             ` Duy Nguyen
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2016-06-08 16:19 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Christian Couder, git

On Wed, Jun 08, 2016 at 05:44:06PM +0700, Duy Nguyen wrote:

> On Wed, Jun 8, 2016 at 3:23 AM, Jeff King <peff@peff.net> wrote:
> > Because this "external odb" essentially acts as a git alternate, we
> > would hit it only when we couldn't find an object through regular means.
> > Git would then make the object available in the usual on-disk format
> > (probably as a loose object).
> 
> This means git-gc (and all things that do rev-list --objects --all)
> would download at least all trees and commits? Or will we have special
> treatment for those commands?

Yes. To me, this was always about punting large blobs from the clones.
Basically the way git-lfs and other tools work, but without munging your
history permanently.

I don't know if Christian had other cases in mind (like the many-files
case, which I think is better served by something like narrow clones).

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-07 20:23                       ` Jeff King
  2016-06-08 10:44                         ` Duy Nguyen
@ 2016-06-08 18:05                         ` Junio C Hamano
  2016-06-08 19:00                           ` Jeff King
  1 sibling, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2016-06-08 18:05 UTC (permalink / raw)
  To: Jeff King; +Cc: Christian Couder, Duy Nguyen, git

Jeff King <peff@peff.net> writes:

> This interface comes from my earlier patches, so I'll try to shed a
> little light on the decisions I made there.
>
> Because this "external odb" essentially acts as a git alternate, we
> would hit it only when we couldn't find an object through regular means.
> Git would then make the object available in the usual on-disk format
> (probably as a loose object).
>
> So in most processes, we would not need to consult the odb command at
> all. And when we do, the first thing would be to get its "have" list,
> which would at most run once per process.
>
> So the per-object cost is really calling "get", and my assumption there
> was that the cost of actually retrieving the object over the network
> would dwarf the fork/exec cost.

OK, presented that way, the design makes sense (I do not know if
Christian's (revised) design and implementation does or not, though,
as I haven't seen it).

As "check for non-existence" is important and costly, grabbing
"have" once is a good strategy, just like we open the .idx files of
available packfiles.

>> >   - "<command> have": the command should output the sha1, size and
>> > type of all the objects the external ODB contains, one object per
>> > line.
>> 
>> Why size and type at this point is needed by the clients?  That is
>> more expensive to compute than just a bare list of object names.
>
> Yes, but it lets git avoid doing a lot of "get" operations.

OK, so it is more like having richer information in pack-v4 index ;-)

>> >   - "<command> put <sha1> <size> <type>": the command should then read
>> > from stdin an object and store it in the external ODB.
>> 
>> Is ODB required to sanity check that <sha1> matches what the data
>> hashes down to?
>
> I think that would be up to the ODB, but it does seem like a good idea.
>
> Likewise, I'm not sure if "get" should be allowed to return contents
> that don't match the sha1.

Yes, this is what I was getting at.  It would be ideal to come up
with a way to do the large-blob offload without resorting to hacks
(like LFS and annex where "the same object contents will always
result in the same object name" is deliberately broken), and "object
name must match what the data hashes down to" is a basic requirement
for that.

Thanks.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-08 18:05                         ` Junio C Hamano
@ 2016-06-08 19:00                           ` Jeff King
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff King @ 2016-06-08 19:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Christian Couder, Duy Nguyen, git

On Wed, Jun 08, 2016 at 11:05:20AM -0700, Junio C Hamano wrote:

> > Likewise, I'm not sure if "get" should be allowed to return contents
> > that don't match the sha1.
> 
> Yes, this is what I was getting at.  It would be ideal to come up
> with a way to do the large-blob offload without resorting to hacks
> (like LFS and annex where "the same object contents will always
> result in the same object name" is deliberately broken), and "object
> name must match what the data hashes down to" is a basic requirement
> for that.

I meant to elaborate here more, but it looks like I didn't.

One thing that an external odb command might want to be able to do is
say "I _do_ have that object, but it would be expensive or impossible to
get right now, so I will give you a placeholder" (e.g., you are just
trying to run "git log" while on an airplane, and you would not want to
die() because you cannot fetch some blob).

But the right way is not to have "get" send content that does not match
the requested sha1. It needs to make git aware that the object is a
placeholder, so git does not do stupid things like write the bogus
content into a loose object.

The right way may be as simple as the external odb returning a non-zero
exit code, and git fills in the placeholder data itself (or dies,
possibly, depending on what the user asks it to do).

One of the reasons I worked up that initial external-odb patch was
because I knew that before we settled on a definite interface, we would
have to try it out and see what odd corner cases came up. E.g., when do
we fault in objects in a way that's annoying to the user? Which code
paths need to handle "we do have this object available, but you can't
see it right now, so what kind of fallback can we do?". Etc.

Unfortunately I never actually did any of that testing. ;)

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-08 16:19                           ` Jeff King
@ 2016-06-09  8:53                             ` Duy Nguyen
  2016-06-09 17:23                               ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Duy Nguyen @ 2016-06-09  8:53 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Christian Couder, git

On Wed, Jun 8, 2016 at 11:19 PM, Jeff King <peff@peff.net> wrote:
> On Wed, Jun 08, 2016 at 05:44:06PM +0700, Duy Nguyen wrote:
>
>> On Wed, Jun 8, 2016 at 3:23 AM, Jeff King <peff@peff.net> wrote:
>> > Because this "external odb" essentially acts as a git alternate, we
>> > would hit it only when we couldn't find an object through regular means.
>> > Git would then make the object available in the usual on-disk format
>> > (probably as a loose object).
>>
>> This means git-gc (and all things that do rev-list --objects --all)
>> would download at least all trees and commits? Or will we have special
>> treatment for those commands?
>
> Yes. To me, this was always about punting large blobs from the clones.
> Basically the way git-lfs and other tools work, but without munging your
> history permanently.

Makes sense. If we keep all trees and commits locally, pack v4 still
has a chance to rise!

> I don't know if Christian had other cases in mind (like the many-files
> case, which I think is better served by something like narrow clones).

Although for git-gc or git-fsck, I guess we need special support
anyway not to download large blobs unnecessarily. Not sure if git-gc
can already do that now. All I remember is git-repack can still be
used to make a repo independent from odb alternates. We probably want
to avoid that. git-fsck definitely should verify that large remote
blobs are good without downloading them (a new "fsck" command to
external odb, maybe).
-- 
Duy

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH v2 4/4] bundle v3: the beginning
  2016-06-09  8:53                             ` Duy Nguyen
@ 2016-06-09 17:23                               ` Jeff King
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff King @ 2016-06-09 17:23 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Christian Couder, git

On Thu, Jun 09, 2016 at 03:53:26PM +0700, Duy Nguyen wrote:

> > Yes. To me, this was always about punting large blobs from the clones.
> > Basically the way git-lfs and other tools work, but without munging your
> > history permanently.
> 
> Makes sense. If we keep all trees and commits locally, pack v4 still
> has a chance to rise!

Yeah, I don't think anything here precludes pack v4.

> > I don't know if Christian had other cases in mind (like the many-files
> > case, which I think is better served by something like narrow clones).
> 
> Although for git-gc or git-fsck, I guess we need special support
> anyway not to download large blobs unnecessarily. Not sure if git-gc
> can already do that now. All I remember is git-repack can still be
> used to make a repo independent from odb alternates. We probably want
> to avoid that. git-fsck definitely should verify that large remote
> blobs are good without downloading them (a new "fsck" command to
> external odb, maybe).

I think git-gc should work out of the box; you'd want to use "repack
-l", which git-gc passes already.

Fsck would be OK as long as you didn't actually load blobs. We have
--connectivity-only for that, but of course it isn't the default. You'd
probably want the default mode to fsck local blobs, but _not_ to fault
in external blobs (but have an option to fault them all in if you really
wanted to be sure you have everything).

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2016-06-09 17:24 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-01 23:35 [PATCH 1/2] bundle: plug resource leak Junio C Hamano
2016-03-01 23:36 ` [PATCH 2/2] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
2016-03-02  9:01   ` Jeff King
2016-03-02 18:15     ` Junio C Hamano
2016-03-02 20:32       ` [PATCH v2 0/4] "split bundle" preview Junio C Hamano
2016-03-02 20:32         ` [PATCH v2 1/4] bundle doc: 'verify' is not about verifying the bundle Junio C Hamano
2016-03-02 20:32         ` [PATCH v2 2/4] bundle: plug resource leak Junio C Hamano
2016-03-02 20:32         ` [PATCH v2 3/4] bundle: keep a copy of bundle file name in the in-core bundle header Junio C Hamano
2016-03-02 20:49           ` Jeff King
2016-03-02 20:32         ` [PATCH v2 4/4] bundle v3: the beginning Junio C Hamano
2016-03-03  1:36           ` Duy Nguyen
2016-03-03  2:57             ` Junio C Hamano
2016-03-03  5:15               ` Duy Nguyen
2016-05-20 12:39           ` Christian Couder
2016-05-31 12:43             ` Duy Nguyen
2016-05-31 13:18               ` Christian Couder
2016-06-01 13:37                 ` Duy Nguyen
2016-06-07 14:49                   ` Christian Couder
2016-06-01 14:00                 ` Duy Nguyen
2016-06-07  8:46                   ` Christian Couder
2016-06-07  8:53                     ` Mike Hommey
2016-06-07 10:22                     ` Duy Nguyen
2016-06-07 19:23                     ` Junio C Hamano
2016-06-07 20:23                       ` Jeff King
2016-06-08 10:44                         ` Duy Nguyen
2016-06-08 16:19                           ` Jeff King
2016-06-09  8:53                             ` Duy Nguyen
2016-06-09 17:23                               ` Jeff King
2016-06-08 18:05                         ` Junio C Hamano
2016-06-08 19:00                           ` Jeff King
2016-05-31 22:23               ` Jeff King
2016-05-31 22:31             ` Jeff King
2016-06-07 13:19               ` Christian Couder
2016-06-07 20:35                 ` Jeff King
2016-03-02  8:54 ` [PATCH 1/2] bundle: plug resource leak Jeff King
2016-03-02  9:00   ` Junio C Hamano
2016-03-02  9:02     ` Jeff King

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).