git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH] clone, submodule: pass partial clone filters to submodules
@ 2022-01-21  3:32 Josh Steadmon
  2022-01-22  1:49 ` Junio C Hamano
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Josh Steadmon @ 2022-01-21  3:32 UTC (permalink / raw)
  To: git

When cloning a repo with a --filter and with --recurse-submodules
enabled, the partial clone filter only applies to
the top-level repo. This can lead to unexpected bandwidth and disk
usage for projects which include large submodules. For example, a user
might wish to make a partial clone of Gerrit and would run:
`git clone --recurse-submodules --filter=blob:5k
https://gerrit.googlesource.com/gerrit`. However, only the superproject
would be a partial clone; all the submodules would have all blobs
downloaded regardless of their size. With this change, the same filter
applies to submodules, meaning the expected bandwidth and disk savings
apply consistently.

Plumb the --filter argument from git-clone through git-submodule and
git-submodule--helper, such that submodule clones also have the filter
applied.

This applies the same filter to the superproject and all submodules.
Users who prefer the current behavior (i.e., a filter only on the
superproject) would need to clone with `--no-recurse-submodules` and
then manually initialize each submodule.

Applying filters to submodules should be safe thanks to Jonathan Tan's
recent work [1, 2, 3] eliminating the use of alternates as a method of
accessing submodule objects, so any submodule object access now triggers
a lazy fetch from the submodule's promisor remote if the accessed object
is missing. This patch is an updated version of [4], which was created
prior to Jonathan Tan's work.

[1]: 8721e2e (Merge branch 'jt/partial-clone-submodule-1', 2021-07-16)
[2]: 11e5d0a (Merge branch 'jt/grep-wo-submodule-odb-as-alternate',
	2021-09-20)
[3]: 162a13b (Merge branch 'jt/no-abuse-alternate-odb-for-submodules',
	2021-10-25)
[4]: https://lore.kernel.org/git/52bf9d45b8e2b72ff32aa773f2415bf7b2b86da2.1563322192.git.steadmon@google.com/

Signed-off-by: Josh Steadmon <steadmon@google.com>
---
 builtin/clone.c                    |  4 ++++
 builtin/submodule--helper.c        | 30 ++++++++++++++++++++++---
 git-submodule.sh                   | 17 +++++++++++++-
 t/t5617-clone-submodules-remote.sh | 17 ++++++++++++++
 t/t7814-grep-recurse-submodules.sh | 36 ++++++++++++++++++++++++++++++
 5 files changed, 100 insertions(+), 4 deletions(-)

diff --git a/builtin/clone.c b/builtin/clone.c
index 727e16e0ae..196e947714 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -729,6 +729,10 @@ static int checkout(int submodule_progress)
 			strvec_push(&args, "--no-fetch");
 		}
 
+		if (filter_options.choice)
+			strvec_pushf(&args, "--filter=%s",
+				     expand_list_objects_filter_spec(&filter_options));
+
 		if (option_single_branch >= 0)
 			strvec_push(&args, option_single_branch ?
 					       "--single-branch" :
diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
index c5d3fc3817..11552970f2 100644
--- a/builtin/submodule--helper.c
+++ b/builtin/submodule--helper.c
@@ -20,6 +20,7 @@
 #include "diff.h"
 #include "object-store.h"
 #include "advice.h"
+#include "list-objects-filter-options.h"
 
 #define OPT_QUIET (1 << 0)
 #define OPT_CACHED (1 << 1)
@@ -1630,6 +1631,7 @@ struct module_clone_data {
 	const char *name;
 	const char *url;
 	const char *depth;
+	struct list_objects_filter_options *filter_options;
 	struct string_list reference;
 	unsigned int quiet: 1;
 	unsigned int progress: 1;
@@ -1796,6 +1798,10 @@ static int clone_submodule(struct module_clone_data *clone_data)
 			strvec_push(&cp.args, "--dissociate");
 		if (sm_gitdir && *sm_gitdir)
 			strvec_pushl(&cp.args, "--separate-git-dir", sm_gitdir, NULL);
+		if (clone_data->filter_options && clone_data->filter_options->choice)
+			strvec_pushf(&cp.args, "--filter=%s",
+				     expand_list_objects_filter_spec(
+					     clone_data->filter_options));
 		if (clone_data->single_branch >= 0)
 			strvec_push(&cp.args, clone_data->single_branch ?
 				    "--single-branch" :
@@ -1852,6 +1858,7 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 {
 	int dissociate = 0, quiet = 0, progress = 0, require_init = 0;
 	struct module_clone_data clone_data = MODULE_CLONE_DATA_INIT;
+	struct list_objects_filter_options filter_options;
 
 	struct option module_clone_options[] = {
 		OPT_STRING(0, "prefix", &clone_data.prefix,
@@ -1881,17 +1888,19 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 			   N_("disallow cloning into non-empty directory")),
 		OPT_BOOL(0, "single-branch", &clone_data.single_branch,
 			 N_("clone only one branch, HEAD or --branch")),
+		OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 		OPT_END()
 	};
 
 	const char *const git_submodule_helper_usage[] = {
 		N_("git submodule--helper clone [--prefix=<path>] [--quiet] "
 		   "[--reference <repository>] [--name <name>] [--depth <depth>] "
-		   "[--single-branch] "
+		   "[--single-branch] [--filter <filter-spec>]"
 		   "--url <url> --path <path>"),
 		NULL
 	};
 
+	memset(&filter_options, 0, sizeof(filter_options));
 	argc = parse_options(argc, argv, prefix, module_clone_options,
 			     git_submodule_helper_usage, 0);
 
@@ -1899,12 +1908,14 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 	clone_data.quiet = !!quiet;
 	clone_data.progress = !!progress;
 	clone_data.require_init = !!require_init;
+	clone_data.filter_options = &filter_options;
 
 	if (argc || !clone_data.url || !clone_data.path || !*(clone_data.path))
 		usage_with_options(git_submodule_helper_usage,
 				   module_clone_options);
 
 	clone_submodule(&clone_data);
+	list_objects_filter_release(&filter_options);
 	return 0;
 }
 
@@ -1994,6 +2005,7 @@ struct submodule_update_clone {
 	const char *recursive_prefix;
 	const char *prefix;
 	int single_branch;
+	struct list_objects_filter_options *filter_options;
 
 	/* to be consumed by git-submodule.sh */
 	struct update_clone_data *update_clone;
@@ -2154,6 +2166,9 @@ static int prepare_to_clone_next_submodule(const struct cache_entry *ce,
 		strvec_pushl(&child->args, "--prefix", suc->prefix, NULL);
 	if (suc->recommend_shallow && sub->recommend_shallow == 1)
 		strvec_push(&child->args, "--depth=1");
+	if (suc->filter_options && suc->filter_options->choice)
+		strvec_pushf(&child->args, "--filter=%s",
+			     expand_list_objects_filter_spec(suc->filter_options));
 	if (suc->require_init)
 		strvec_push(&child->args, "--require-init");
 	strvec_pushl(&child->args, "--path", sub->path, NULL);
@@ -2498,6 +2513,8 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 	const char *update = NULL;
 	struct pathspec pathspec;
 	struct submodule_update_clone suc = SUBMODULE_UPDATE_CLONE_INIT;
+	struct list_objects_filter_options filter_options;
+	int ret;
 
 	struct option module_update_clone_options[] = {
 		OPT_STRING(0, "prefix", &prefix,
@@ -2528,6 +2545,7 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 			   N_("disallow cloning into non-empty directory")),
 		OPT_BOOL(0, "single-branch", &suc.single_branch,
 			 N_("clone only one branch, HEAD or --branch")),
+		OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 		OPT_END()
 	};
 
@@ -2540,20 +2558,26 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 	update_clone_config_from_gitmodules(&suc.max_jobs);
 	git_config(git_update_clone_config, &suc.max_jobs);
 
+	memset(&filter_options, 0, sizeof(filter_options));
 	argc = parse_options(argc, argv, prefix, module_update_clone_options,
 			     git_submodule_helper_usage, 0);
+	suc.filter_options = &filter_options;
 
 	if (update)
 		if (parse_submodule_update_strategy(update, &suc.update) < 0)
 			die(_("bad value for update parameter"));
 
-	if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0)
+	if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0) {
+		list_objects_filter_release(&filter_options);
 		return 1;
+	}
 
 	if (pathspec.nr)
 		suc.warn_if_uninitialized = 1;
 
-	return update_submodules(&suc);
+	ret = update_submodules(&suc);
+	list_objects_filter_release(&filter_options);
+	return ret;
 }
 
 static int run_update_procedure(int argc, const char **argv, const char *prefix)
diff --git a/git-submodule.sh b/git-submodule.sh
index 652861aa66..926f6873d3 100755
--- a/git-submodule.sh
+++ b/git-submodule.sh
@@ -10,7 +10,7 @@ USAGE="[--quiet] [--cached]
    or: $dashless [--quiet] status [--cached] [--recursive] [--] [<path>...]
    or: $dashless [--quiet] init [--] [<path>...]
    or: $dashless [--quiet] deinit [-f|--force] (--all| [--] <path>...)
-   or: $dashless [--quiet] update [--init] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
+   or: $dashless [--quiet] update [--init [--filter=<filter-spec>]] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
    or: $dashless [--quiet] set-branch (--default|--branch <branch>) [--] <path>
    or: $dashless [--quiet] set-url [--] <path> <newurl>
    or: $dashless [--quiet] summary [--cached|--files] [--summary-limit <n>] [commit] [--] [<path>...]
@@ -49,6 +49,7 @@ dissociate=
 single_branch=
 jobs=
 recommend_shallow=
+filter=
 
 die_if_unmatched ()
 {
@@ -347,6 +348,14 @@ cmd_update()
 		--no-single-branch)
 			single_branch="--no-single-branch"
 			;;
+		--filter)
+			case "$2" in '') usage ;; esac
+			filter="--filter=$2"
+			shift
+			;;
+		--filter=*)
+			filter=$1
+			;;
 		--)
 			shift
 			break
@@ -361,6 +370,11 @@ cmd_update()
 		shift
 	done
 
+	if test -n "$filter" && test "$init" != "1"
+	then
+		usage
+	fi
+
 	if test -n "$init"
 	then
 		cmd_init "--" "$@" || return
@@ -379,6 +393,7 @@ cmd_update()
 		$single_branch \
 		$recommend_shallow \
 		$jobs \
+		$filter \
 		-- \
 		"$@" || echo "#unmatched" $?
 	} | {
diff --git a/t/t5617-clone-submodules-remote.sh b/t/t5617-clone-submodules-remote.sh
index e2dbb4eaba..bc4fa11d51 100755
--- a/t/t5617-clone-submodules-remote.sh
+++ b/t/t5617-clone-submodules-remote.sh
@@ -28,6 +28,13 @@ test_expect_success 'setup' '
 	)
 '
 
+# bare clone giving "srv.bare" for use as our server.
+test_expect_success 'setup bare clone for server' '
+	git clone --bare "file://$(pwd)/." srv.bare &&
+	git -C srv.bare config --local uploadpack.allowfilter 1 &&
+	git -C srv.bare config --local uploadpack.allowanysha1inwant 1
+'
+
 test_expect_success 'clone with --no-remote-submodules' '
 	test_when_finished "rm -rf super_clone" &&
 	git clone --recurse-submodules --no-remote-submodules "file://$pwd/." super_clone &&
@@ -65,4 +72,14 @@ test_expect_success 'clone with --single-branch' '
 	)
 '
 
+# do basic partial clone from "srv.bare"
+# confirm partial clone was registered in the local config for super and sub.
+test_expect_success 'clone with --filter' '
+	git clone --recurse-submodules --filter blob:none "file://$pwd/srv.bare" super_clone &&
+	test_cmp_config -C super_clone 1 core.repositoryformatversion &&
+	test_cmp_config -C super_clone blob:none remote.origin.partialclonefilter &&
+	test_cmp_config -C super_clone/sub 1 core.repositoryformatversion &&
+	test_cmp_config -C super_clone/sub blob:none remote.origin.partialclonefilter
+'
+
 test_done
diff --git a/t/t7814-grep-recurse-submodules.sh b/t/t7814-grep-recurse-submodules.sh
index 058e5d0c96..f7452af262 100755
--- a/t/t7814-grep-recurse-submodules.sh
+++ b/t/t7814-grep-recurse-submodules.sh
@@ -544,4 +544,40 @@ test_expect_failure 'grep saves textconv cache in the appropriate repository' '
 	test_path_is_file "$sub_textconv_cache"
 '
 
+test_expect_success 'grep partially-cloned submodule' '
+	# Set up clean superproject and submodule for partial cloning.
+	git init super &&
+	git init super/sub &&
+	(
+		cd super &&
+		test_commit --no-tag "Add file in superproject" super-file "Some content for super-file" &&
+		test_commit -C sub --no-tag "Add file in submodule" sub-file "Some content for sub-file" &&
+		git submodule add ./sub &&
+		git commit -m "Add other as submodule sub" &&
+		test_tick &&
+		test_commit -C sub --no-tag --append "Update file in submodule" sub-file "Some more content for sub-file" &&
+		git add sub &&
+		git commit -m "Update submodule" &&
+		test_tick &&
+		git config --local uploadpack.allowfilter 1 &&
+		git config --local uploadpack.allowanysha1inwant 1 &&
+		git -C sub config --local uploadpack.allowfilter 1 &&
+		git -C sub config --local uploadpack.allowanysha1inwant 1
+	) &&
+	# Clone the superproject & submodule, then make sure we can lazy-fetch submodule objects.
+	git clone --filter=blob:none --recurse-submodules "file://$(pwd)/super" partial &&
+	(
+		cd partial &&
+		cat >expect <<-\EOF &&
+		HEAD^:sub/sub-file:Some content for sub-file
+		HEAD^:super-file:Some content for super-file
+		EOF
+
+		GIT_TRACE2_EVENT="$(pwd)/trace2.log" git grep -e content --recurse-submodules HEAD^ >actual &&
+		test_cmp expect actual &&
+		# Verify that we actually fetched data from the promisor remote:
+		grep \"category\":\"promisor\",\"key\":\"fetch_count\",\"value\":\"1\" trace2.log >/dev/null
+	)
+'
+
 test_done

base-commit: b56bd95bbc8f716cb8cbb5fdc18b9b0f00323c6a
-- 
2.35.0.rc0.227.g00780c9af4-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] clone, submodule: pass partial clone filters to submodules
  2022-01-21  3:32 [PATCH] clone, submodule: pass partial clone filters to submodules Josh Steadmon
@ 2022-01-22  1:49 ` Junio C Hamano
  2022-01-25 21:00   ` Elijah Newren
  2022-01-25 21:08 ` Elijah Newren
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2022-01-22  1:49 UTC (permalink / raw)
  To: Josh Steadmon; +Cc: git

Josh Steadmon <steadmon@google.com> writes:

> When cloning a repo with a --filter and with --recurse-submodules
> enabled, the partial clone filter only applies to
> the top-level repo. This can lead to unexpected bandwidth and disk
> usage for projects which include large submodules. For example, a user
> might wish to make a partial clone of Gerrit and would run:
> `git clone --recurse-submodules --filter=blob:5k
> https://gerrit.googlesource.com/gerrit`. However, only the superproject
> would be a partial clone; all the submodules would have all blobs
> downloaded regardless of their size. With this change, the same filter
> applies to submodules, meaning the expected bandwidth and disk savings
> apply consistently.
>
> Plumb the --filter argument from git-clone through git-submodule and
> git-submodule--helper, such that submodule clones also have the filter
> applied.
>
> This applies the same filter to the superproject and all submodules.
> Users who prefer the current behavior (i.e., a filter only on the
> superproject) would need to clone with `--no-recurse-submodules` and
> then manually initialize each submodule.

Two concerns (I do not say "issues", because I honestly do not know
how much this will hurt in the future).

 - Obviously, this changes the end user experience.  To users in the
   scenario that motivated this change (described above), obviously
   it is a change in a good way, and but I wonder if there are
   workflows that are hurt and actually have to resort to the
   workaround to preserve the current behaviour.

 - Passing the filter down to submodules means that the filter
   settings are universal across projects.  The current set of
   filters, I do not think such an assumption is too bad.  If 5k
   blob is too large for the top-level superproject, it is OK for
   the superproject to dictate that 5k blob is too large for any of
   the submodules the superproject uses.  But can we forever limit
   the filter vocabulary to the ones that can sensibly be applied
   recursively?  If we had a filter that goes with pathnames
   (e.g. "I only want src/ and test/ directories and nothing else
   initially"), such a set of pathnames appropriate in the context
   of the superproject is unlikely to apply to its submodules.  Even
   the existing "depth" filter is iffy, if a toplevel superproject
   is fairly flat and one of the submodules has a directory
   hierarchy that is ultra deep.

Will queue and wait for others to comment.

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] clone, submodule: pass partial clone filters to submodules
  2022-01-22  1:49 ` Junio C Hamano
@ 2022-01-25 21:00   ` Elijah Newren
  2022-01-26  6:03     ` Junio C Hamano
  0 siblings, 1 reply; 14+ messages in thread
From: Elijah Newren @ 2022-01-25 21:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Josh Steadmon, Git Mailing List

On Sat, Jan 22, 2022 at 5:36 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Josh Steadmon <steadmon@google.com> writes:
>
> > When cloning a repo with a --filter and with --recurse-submodules
> > enabled, the partial clone filter only applies to
> > the top-level repo. This can lead to unexpected bandwidth and disk
> > usage for projects which include large submodules. For example, a user
> > might wish to make a partial clone of Gerrit and would run:
> > `git clone --recurse-submodules --filter=blob:5k
> > https://gerrit.googlesource.com/gerrit`. However, only the superproject
> > would be a partial clone; all the submodules would have all blobs
> > downloaded regardless of their size. With this change, the same filter
> > applies to submodules, meaning the expected bandwidth and disk savings
> > apply consistently.
> >
> > Plumb the --filter argument from git-clone through git-submodule and
> > git-submodule--helper, such that submodule clones also have the filter
> > applied.
> >
> > This applies the same filter to the superproject and all submodules.
> > Users who prefer the current behavior (i.e., a filter only on the
> > superproject) would need to clone with `--no-recurse-submodules` and
> > then manually initialize each submodule.
>
> Two concerns (I do not say "issues", because I honestly do not know
> how much this will hurt in the future).
>
>  - Obviously, this changes the end user experience.  To users in the
>    scenario that motivated this change (described above), obviously
>    it is a change in a good way, and but I wonder if there are
>    workflows that are hurt and actually have to resort to the
>    workaround to preserve the current behaviour.
>
>  - Passing the filter down to submodules means that the filter
>    settings are universal across projects.  The current set of
>    filters, I do not think such an assumption is too bad.  If 5k
>    blob is too large for the top-level superproject, it is OK for
>    the superproject to dictate that 5k blob is too large for any of
>    the submodules the superproject uses.  But can we forever limit
>    the filter vocabulary to the ones that can sensibly be applied
>    recursively?  If we had a filter that goes with pathnames
>    (e.g. "I only want src/ and test/ directories and nothing else
>    initially"), such a set of pathnames appropriate in the context
>    of the superproject is unlikely to apply to its submodules.  Even
>    the existing "depth" filter is iffy, if a toplevel superproject
>    is fairly flat and one of the submodules has a directory
>    hierarchy that is ultra deep.

This second item matches the concern I wanted to raise.  I've wanted
to do "sparse clones" for over a decade[*].  In my opinion,
disconnected development should not require a full clone or constant
network connectivity.  We're inching closer to this goal: (1) partial
clones provide some of the base ability for partial downloads
augmented by updates as needed, (2) sparse-checkout makes the working
tree handle a subset so we can work without downloading more, (3)
cone-mode corrects the mistake of specifying paths via gitignore-style
patterns, and (4) merge-ort has some huge wins for partial clones to
avoid downloading objects by both avoiding unnecessary rename
detections and avoiding traversing into unmodified directories.

There's a number of next steps, of course, one of which is that I
would really like to add a path-based filter (corresponding to the
directories in the sparsity cone) so that the initial partial clone
can get all commits, all trees, and the paths within the sparsity
cone.

But how would that interact with this patch?  There's a bit of a
conflict.  There's a few ways out:
  * Make your change be explicit rather than implicit: Based on
Junio's first concern, you could modify this patch so that it requires
a new flag like --filter-submodules-too (or some better name), and
perhaps folks with a path filter just wouldn't use that.
  * Make these incompatible: Maybe a path filter is incompatible with
--recurse-submodules, and we should throw an error if both are
specified.
  * Attempt to marry the two options: Each submodule could perhaps
extract the subset of paths with itself as a leading directory and
remove that leading prefix and then use that as the path portion of
the filter.  (And perhaps even taking this a step farther: each level
of cloning will only recurse into submodules which match the specified
paths).

I'm inclined to prefer an explicit flag for this behavior, but I feel
bad asking for that due to behavior and code that doesn't exist and
isn't even being worked on yet.  If your patch does go through as-is,
I'd probably implement the
make-path-filters-and-recurse-submodules-incompatible option when
adding path based filters.


[*] https://lore.kernel.org/git/AANLkTikJhSVJw2hXkp0j6XA+k-J-AtSYzKWumjnqqsgz@mail.gmail.com/,
the old code may be dead but the dream isn't

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] clone, submodule: pass partial clone filters to submodules
  2022-01-21  3:32 [PATCH] clone, submodule: pass partial clone filters to submodules Josh Steadmon
  2022-01-22  1:49 ` Junio C Hamano
@ 2022-01-25 21:08 ` Elijah Newren
  2022-02-01 21:34   ` Josh Steadmon
  2022-02-05  0:40 ` [PATCH v2] " Josh Steadmon
  2022-02-05  5:00 ` [PATCH v3] " Josh Steadmon
  3 siblings, 1 reply; 14+ messages in thread
From: Elijah Newren @ 2022-01-25 21:08 UTC (permalink / raw)
  To: Josh Steadmon; +Cc: Git Mailing List

On Fri, Jan 21, 2022 at 4:31 PM Josh Steadmon <steadmon@google.com> wrote:
>
> When cloning a repo with a --filter and with --recurse-submodules
> enabled, the partial clone filter only applies to
> the top-level repo. This can lead to unexpected bandwidth and disk
> usage for projects which include large submodules. For example, a user
> might wish to make a partial clone of Gerrit and would run:
> `git clone --recurse-submodules --filter=blob:5k
> https://gerrit.googlesource.com/gerrit`. However, only the superproject
> would be a partial clone; all the submodules would have all blobs
> downloaded regardless of their size. With this change, the same filter
> applies to submodules, meaning the expected bandwidth and disk savings
> apply consistently.
>
> Plumb the --filter argument from git-clone through git-submodule and
> git-submodule--helper, such that submodule clones also have the filter
> applied.
>
> This applies the same filter to the superproject and all submodules.
> Users who prefer the current behavior (i.e., a filter only on the
> superproject) would need to clone with `--no-recurse-submodules` and
> then manually initialize each submodule.
>
> Applying filters to submodules should be safe thanks to Jonathan Tan's
> recent work [1, 2, 3] eliminating the use of alternates as a method of
> accessing submodule objects, so any submodule object access now triggers
> a lazy fetch from the submodule's promisor remote if the accessed object
> is missing. This patch is an updated version of [4], which was created
> prior to Jonathan Tan's work.
>
> [1]: 8721e2e (Merge branch 'jt/partial-clone-submodule-1', 2021-07-16)
> [2]: 11e5d0a (Merge branch 'jt/grep-wo-submodule-odb-as-alternate',
>         2021-09-20)
> [3]: 162a13b (Merge branch 'jt/no-abuse-alternate-odb-for-submodules',
>         2021-10-25)
> [4]: https://lore.kernel.org/git/52bf9d45b8e2b72ff32aa773f2415bf7b2b86da2.1563322192.git.steadmon@google.com/
>
> Signed-off-by: Josh Steadmon <steadmon@google.com>
> ---
>  builtin/clone.c                    |  4 ++++
>  builtin/submodule--helper.c        | 30 ++++++++++++++++++++++---
>  git-submodule.sh                   | 17 +++++++++++++-
>  t/t5617-clone-submodules-remote.sh | 17 ++++++++++++++
>  t/t7814-grep-recurse-submodules.sh | 36 ++++++++++++++++++++++++++++++
>  5 files changed, 100 insertions(+), 4 deletions(-)
>
> diff --git a/builtin/clone.c b/builtin/clone.c
> index 727e16e0ae..196e947714 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -729,6 +729,10 @@ static int checkout(int submodule_progress)
>                         strvec_push(&args, "--no-fetch");
>                 }
>
> +               if (filter_options.choice)
> +                       strvec_pushf(&args, "--filter=%s",
> +                                    expand_list_objects_filter_spec(&filter_options));
> +
>                 if (option_single_branch >= 0)
>                         strvec_push(&args, option_single_branch ?
>                                                "--single-branch" :
> diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
> index c5d3fc3817..11552970f2 100644
> --- a/builtin/submodule--helper.c
> +++ b/builtin/submodule--helper.c
> @@ -20,6 +20,7 @@
>  #include "diff.h"
>  #include "object-store.h"
>  #include "advice.h"
> +#include "list-objects-filter-options.h"
>
>  #define OPT_QUIET (1 << 0)
>  #define OPT_CACHED (1 << 1)
> @@ -1630,6 +1631,7 @@ struct module_clone_data {
>         const char *name;
>         const char *url;
>         const char *depth;
> +       struct list_objects_filter_options *filter_options;
>         struct string_list reference;
>         unsigned int quiet: 1;
>         unsigned int progress: 1;
> @@ -1796,6 +1798,10 @@ static int clone_submodule(struct module_clone_data *clone_data)
>                         strvec_push(&cp.args, "--dissociate");
>                 if (sm_gitdir && *sm_gitdir)
>                         strvec_pushl(&cp.args, "--separate-git-dir", sm_gitdir, NULL);
> +               if (clone_data->filter_options && clone_data->filter_options->choice)
> +                       strvec_pushf(&cp.args, "--filter=%s",
> +                                    expand_list_objects_filter_spec(
> +                                            clone_data->filter_options));
>                 if (clone_data->single_branch >= 0)
>                         strvec_push(&cp.args, clone_data->single_branch ?
>                                     "--single-branch" :
> @@ -1852,6 +1858,7 @@ static int module_clone(int argc, const char **argv, const char *prefix)
>  {
>         int dissociate = 0, quiet = 0, progress = 0, require_init = 0;
>         struct module_clone_data clone_data = MODULE_CLONE_DATA_INIT;
> +       struct list_objects_filter_options filter_options;
>
>         struct option module_clone_options[] = {
>                 OPT_STRING(0, "prefix", &clone_data.prefix,
> @@ -1881,17 +1888,19 @@ static int module_clone(int argc, const char **argv, const char *prefix)
>                            N_("disallow cloning into non-empty directory")),
>                 OPT_BOOL(0, "single-branch", &clone_data.single_branch,
>                          N_("clone only one branch, HEAD or --branch")),
> +               OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
>                 OPT_END()
>         };
>
>         const char *const git_submodule_helper_usage[] = {
>                 N_("git submodule--helper clone [--prefix=<path>] [--quiet] "
>                    "[--reference <repository>] [--name <name>] [--depth <depth>] "
> -                  "[--single-branch] "
> +                  "[--single-branch] [--filter <filter-spec>]"
>                    "--url <url> --path <path>"),
>                 NULL
>         };
>
> +       memset(&filter_options, 0, sizeof(filter_options));
>         argc = parse_options(argc, argv, prefix, module_clone_options,
>                              git_submodule_helper_usage, 0);
>
> @@ -1899,12 +1908,14 @@ static int module_clone(int argc, const char **argv, const char *prefix)
>         clone_data.quiet = !!quiet;
>         clone_data.progress = !!progress;
>         clone_data.require_init = !!require_init;
> +       clone_data.filter_options = &filter_options;
>
>         if (argc || !clone_data.url || !clone_data.path || !*(clone_data.path))
>                 usage_with_options(git_submodule_helper_usage,
>                                    module_clone_options);
>
>         clone_submodule(&clone_data);
> +       list_objects_filter_release(&filter_options);
>         return 0;
>  }
>
> @@ -1994,6 +2005,7 @@ struct submodule_update_clone {
>         const char *recursive_prefix;
>         const char *prefix;
>         int single_branch;
> +       struct list_objects_filter_options *filter_options;
>
>         /* to be consumed by git-submodule.sh */
>         struct update_clone_data *update_clone;
> @@ -2154,6 +2166,9 @@ static int prepare_to_clone_next_submodule(const struct cache_entry *ce,
>                 strvec_pushl(&child->args, "--prefix", suc->prefix, NULL);
>         if (suc->recommend_shallow && sub->recommend_shallow == 1)
>                 strvec_push(&child->args, "--depth=1");
> +       if (suc->filter_options && suc->filter_options->choice)
> +               strvec_pushf(&child->args, "--filter=%s",
> +                            expand_list_objects_filter_spec(suc->filter_options));
>         if (suc->require_init)
>                 strvec_push(&child->args, "--require-init");
>         strvec_pushl(&child->args, "--path", sub->path, NULL);
> @@ -2498,6 +2513,8 @@ static int update_clone(int argc, const char **argv, const char *prefix)
>         const char *update = NULL;
>         struct pathspec pathspec;
>         struct submodule_update_clone suc = SUBMODULE_UPDATE_CLONE_INIT;
> +       struct list_objects_filter_options filter_options;
> +       int ret;
>
>         struct option module_update_clone_options[] = {
>                 OPT_STRING(0, "prefix", &prefix,
> @@ -2528,6 +2545,7 @@ static int update_clone(int argc, const char **argv, const char *prefix)
>                            N_("disallow cloning into non-empty directory")),
>                 OPT_BOOL(0, "single-branch", &suc.single_branch,
>                          N_("clone only one branch, HEAD or --branch")),
> +               OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
>                 OPT_END()
>         };
>
> @@ -2540,20 +2558,26 @@ static int update_clone(int argc, const char **argv, const char *prefix)
>         update_clone_config_from_gitmodules(&suc.max_jobs);
>         git_config(git_update_clone_config, &suc.max_jobs);
>
> +       memset(&filter_options, 0, sizeof(filter_options));
>         argc = parse_options(argc, argv, prefix, module_update_clone_options,
>                              git_submodule_helper_usage, 0);
> +       suc.filter_options = &filter_options;
>
>         if (update)
>                 if (parse_submodule_update_strategy(update, &suc.update) < 0)
>                         die(_("bad value for update parameter"));
>
> -       if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0)
> +       if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0) {
> +               list_objects_filter_release(&filter_options);
>                 return 1;
> +       }
>
>         if (pathspec.nr)
>                 suc.warn_if_uninitialized = 1;
>
> -       return update_submodules(&suc);
> +       ret = update_submodules(&suc);
> +       list_objects_filter_release(&filter_options);
> +       return ret;
>  }
>
>  static int run_update_procedure(int argc, const char **argv, const char *prefix)
> diff --git a/git-submodule.sh b/git-submodule.sh
> index 652861aa66..926f6873d3 100755
> --- a/git-submodule.sh
> +++ b/git-submodule.sh
> @@ -10,7 +10,7 @@ USAGE="[--quiet] [--cached]
>     or: $dashless [--quiet] status [--cached] [--recursive] [--] [<path>...]
>     or: $dashless [--quiet] init [--] [<path>...]
>     or: $dashless [--quiet] deinit [-f|--force] (--all| [--] <path>...)
> -   or: $dashless [--quiet] update [--init] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
> +   or: $dashless [--quiet] update [--init [--filter=<filter-spec>]] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
>     or: $dashless [--quiet] set-branch (--default|--branch <branch>) [--] <path>
>     or: $dashless [--quiet] set-url [--] <path> <newurl>
>     or: $dashless [--quiet] summary [--cached|--files] [--summary-limit <n>] [commit] [--] [<path>...]
> @@ -49,6 +49,7 @@ dissociate=
>  single_branch=
>  jobs=
>  recommend_shallow=
> +filter=
>
>  die_if_unmatched ()
>  {
> @@ -347,6 +348,14 @@ cmd_update()
>                 --no-single-branch)
>                         single_branch="--no-single-branch"
>                         ;;
> +               --filter)
> +                       case "$2" in '') usage ;; esac
> +                       filter="--filter=$2"
> +                       shift
> +                       ;;
> +               --filter=*)
> +                       filter=$1

Shouldn't this be
     filter="$1"
?  I think it'd work currently without the quotes, but seeing the
discussion of special characters for --filter=combine in
git-rev-list(1) make me worry that this could become unsafe in the
future.

> +                       ;;
>                 --)
>                         shift
>                         break
> @@ -361,6 +370,11 @@ cmd_update()
>                 shift
>         done
>
> +       if test -n "$filter" && test "$init" != "1"
> +       then
> +               usage
> +       fi
> +
>         if test -n "$init"
>         then
>                 cmd_init "--" "$@" || return
> @@ -379,6 +393,7 @@ cmd_update()
>                 $single_branch \
>                 $recommend_shallow \
>                 $jobs \
> +               $filter \
>                 -- \
>                 "$@" || echo "#unmatched" $?
>         } | {
> diff --git a/t/t5617-clone-submodules-remote.sh b/t/t5617-clone-submodules-remote.sh
> index e2dbb4eaba..bc4fa11d51 100755
> --- a/t/t5617-clone-submodules-remote.sh
> +++ b/t/t5617-clone-submodules-remote.sh
> @@ -28,6 +28,13 @@ test_expect_success 'setup' '
>         )
>  '
>
> +# bare clone giving "srv.bare" for use as our server.
> +test_expect_success 'setup bare clone for server' '
> +       git clone --bare "file://$(pwd)/." srv.bare &&
> +       git -C srv.bare config --local uploadpack.allowfilter 1 &&
> +       git -C srv.bare config --local uploadpack.allowanysha1inwant 1
> +'
> +
>  test_expect_success 'clone with --no-remote-submodules' '
>         test_when_finished "rm -rf super_clone" &&
>         git clone --recurse-submodules --no-remote-submodules "file://$pwd/." super_clone &&
> @@ -65,4 +72,14 @@ test_expect_success 'clone with --single-branch' '
>         )
>  '
>
> +# do basic partial clone from "srv.bare"
> +# confirm partial clone was registered in the local config for super and sub.
> +test_expect_success 'clone with --filter' '
> +       git clone --recurse-submodules --filter blob:none "file://$pwd/srv.bare" super_clone &&
> +       test_cmp_config -C super_clone 1 core.repositoryformatversion &&
> +       test_cmp_config -C super_clone blob:none remote.origin.partialclonefilter &&
> +       test_cmp_config -C super_clone/sub 1 core.repositoryformatversion &&
> +       test_cmp_config -C super_clone/sub blob:none remote.origin.partialclonefilter
> +'
> +
>  test_done
> diff --git a/t/t7814-grep-recurse-submodules.sh b/t/t7814-grep-recurse-submodules.sh
> index 058e5d0c96..f7452af262 100755
> --- a/t/t7814-grep-recurse-submodules.sh
> +++ b/t/t7814-grep-recurse-submodules.sh
> @@ -544,4 +544,40 @@ test_expect_failure 'grep saves textconv cache in the appropriate repository' '
>         test_path_is_file "$sub_textconv_cache"
>  '
>
> +test_expect_success 'grep partially-cloned submodule' '
> +       # Set up clean superproject and submodule for partial cloning.
> +       git init super &&
> +       git init super/sub &&
> +       (
> +               cd super &&
> +               test_commit --no-tag "Add file in superproject" super-file "Some content for super-file" &&
> +               test_commit -C sub --no-tag "Add file in submodule" sub-file "Some content for sub-file" &&
> +               git submodule add ./sub &&
> +               git commit -m "Add other as submodule sub" &&
> +               test_tick &&
> +               test_commit -C sub --no-tag --append "Update file in submodule" sub-file "Some more content for sub-file" &&
> +               git add sub &&
> +               git commit -m "Update submodule" &&
> +               test_tick &&
> +               git config --local uploadpack.allowfilter 1 &&
> +               git config --local uploadpack.allowanysha1inwant 1 &&
> +               git -C sub config --local uploadpack.allowfilter 1 &&
> +               git -C sub config --local uploadpack.allowanysha1inwant 1
> +       ) &&
> +       # Clone the superproject & submodule, then make sure we can lazy-fetch submodule objects.
> +       git clone --filter=blob:none --recurse-submodules "file://$(pwd)/super" partial &&
> +       (
> +               cd partial &&
> +               cat >expect <<-\EOF &&
> +               HEAD^:sub/sub-file:Some content for sub-file
> +               HEAD^:super-file:Some content for super-file
> +               EOF
> +
> +               GIT_TRACE2_EVENT="$(pwd)/trace2.log" git grep -e content --recurse-submodules HEAD^ >actual &&
> +               test_cmp expect actual &&
> +               # Verify that we actually fetched data from the promisor remote:
> +               grep \"category\":\"promisor\",\"key\":\"fetch_count\",\"value\":\"1\" trace2.log >/dev/null
> +       )
> +'
> +
>  test_done
>
> base-commit: b56bd95bbc8f716cb8cbb5fdc18b9b0f00323c6a
> --
> 2.35.0.rc0.227.g00780c9af4-goog

I didn't spot anything else in the patch implementation, though I
commented in response to Junio about my concerns about how this might
interact with future filters.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] clone, submodule: pass partial clone filters to submodules
  2022-01-25 21:00   ` Elijah Newren
@ 2022-01-26  6:03     ` Junio C Hamano
  2022-02-01 21:33       ` Josh Steadmon
  0 siblings, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2022-01-26  6:03 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Josh Steadmon, Git Mailing List

Elijah Newren <newren@gmail.com> writes:

> But how would that interact with this patch?  There's a bit of a
> conflict.  There's a few ways out:
>   * Make your change be explicit rather than implicit: Based on
> Junio's first concern, you could modify this patch so that it requires
> a new flag like --filter-submodules-too (or some better name), and
> perhaps folks with a path filter just wouldn't use that.

I would very much prefer this, given that this is a change of
default proposed by those who want a different default than the
status quo, even without the "how would we know it is sensible to
just pass down any and all filters?" issue.

>   * Make these incompatible: Maybe a path filter is incompatible with
> --recurse-submodules, and we should throw an error if both are
> specified.

Perhaps.  Or automatically filter out such an incompatible ones, but
of course, that would mean submodules are made less filtered than
top-level which is usually the other way around.

>   * Attempt to marry the two options: Each submodule could perhaps
> extract the subset of paths with itself as a leading directory and
> remove that leading prefix and then use that as the path portion of
> the filter.  (And perhaps even taking this a step farther: each level
> of cloning will only recurse into submodules which match the specified
> paths).

Yup, for some filters, passing them down may have a "natural"
translation, similar to adding the prefix to a pathspec element.
It would probably depend on the filter if there is such a natural
translation even exists, though.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] clone, submodule: pass partial clone filters to submodules
  2022-01-26  6:03     ` Junio C Hamano
@ 2022-02-01 21:33       ` Josh Steadmon
  0 siblings, 0 replies; 14+ messages in thread
From: Josh Steadmon @ 2022-02-01 21:33 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Elijah Newren, Git Mailing List

On 2022.01.25 22:03, Junio C Hamano wrote:
> Elijah Newren <newren@gmail.com> writes:
> 
> > But how would that interact with this patch?  There's a bit of a
> > conflict.  There's a few ways out:
> >   * Make your change be explicit rather than implicit: Based on
> > Junio's first concern, you could modify this patch so that it requires
> > a new flag like --filter-submodules-too (or some better name), and
> > perhaps folks with a path filter just wouldn't use that.
> 
> I would very much prefer this, given that this is a change of
> default proposed by those who want a different default than the
> status quo, even without the "how would we know it is sensible to
> just pass down any and all filters?" issue.

Yes, I think a new flag (and corresponding set-and-forget config option)
is the right approach. I'll include that in V2.

Out of curiosity, does the project have specific principles about
changing default behavior? For example, would it make sense to plan a
path for --this-new-flag to gradually become the default, or is that
something we'd only consider with a new major-version release?

Just thinking out loud: it should be possible for us at $job and other
people in similar situations with a managed Git installation to look
through traces and see how often people are using or not using a
particular flag or config option, and that could in theory guide such
decisions. On the other hand, I don't think that Git users at megacorps
are sufficiently representative of all Git users to be the basis for
justifying such a change, so we'd need outside feedback as well. Not
trying to suggest any particular agenda or approach here, just wondering
if others have thoughts on this topic.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] clone, submodule: pass partial clone filters to submodules
  2022-01-25 21:08 ` Elijah Newren
@ 2022-02-01 21:34   ` Josh Steadmon
  0 siblings, 0 replies; 14+ messages in thread
From: Josh Steadmon @ 2022-02-01 21:34 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List

On 2022.01.25 13:08, Elijah Newren wrote:
> On Fri, Jan 21, 2022 at 4:31 PM Josh Steadmon <steadmon@google.com> wrote:
> >
> > When cloning a repo with a --filter and with --recurse-submodules
> > enabled, the partial clone filter only applies to
> > the top-level repo. This can lead to unexpected bandwidth and disk
> > usage for projects which include large submodules. For example, a user
> > might wish to make a partial clone of Gerrit and would run:
> > `git clone --recurse-submodules --filter=blob:5k
> > https://gerrit.googlesource.com/gerrit`. However, only the superproject
> > would be a partial clone; all the submodules would have all blobs
> > downloaded regardless of their size. With this change, the same filter
> > applies to submodules, meaning the expected bandwidth and disk savings
> > apply consistently.
> >
> > Plumb the --filter argument from git-clone through git-submodule and
> > git-submodule--helper, such that submodule clones also have the filter
> > applied.
> >
> > This applies the same filter to the superproject and all submodules.
> > Users who prefer the current behavior (i.e., a filter only on the
> > superproject) would need to clone with `--no-recurse-submodules` and
> > then manually initialize each submodule.
> >
> > Applying filters to submodules should be safe thanks to Jonathan Tan's
> > recent work [1, 2, 3] eliminating the use of alternates as a method of
> > accessing submodule objects, so any submodule object access now triggers
> > a lazy fetch from the submodule's promisor remote if the accessed object
> > is missing. This patch is an updated version of [4], which was created
> > prior to Jonathan Tan's work.
> >
> > [1]: 8721e2e (Merge branch 'jt/partial-clone-submodule-1', 2021-07-16)
> > [2]: 11e5d0a (Merge branch 'jt/grep-wo-submodule-odb-as-alternate',
> >         2021-09-20)
> > [3]: 162a13b (Merge branch 'jt/no-abuse-alternate-odb-for-submodules',
> >         2021-10-25)
> > [4]: https://lore.kernel.org/git/52bf9d45b8e2b72ff32aa773f2415bf7b2b86da2.1563322192.git.steadmon@google.com/
> >
> > Signed-off-by: Josh Steadmon <steadmon@google.com>
> > ---
> >  builtin/clone.c                    |  4 ++++
> >  builtin/submodule--helper.c        | 30 ++++++++++++++++++++++---
> >  git-submodule.sh                   | 17 +++++++++++++-
> >  t/t5617-clone-submodules-remote.sh | 17 ++++++++++++++
> >  t/t7814-grep-recurse-submodules.sh | 36 ++++++++++++++++++++++++++++++
> >  5 files changed, 100 insertions(+), 4 deletions(-)
> >
> > diff --git a/builtin/clone.c b/builtin/clone.c
> > index 727e16e0ae..196e947714 100644
> > --- a/builtin/clone.c
> > +++ b/builtin/clone.c
> > @@ -729,6 +729,10 @@ static int checkout(int submodule_progress)
> >                         strvec_push(&args, "--no-fetch");
> >                 }
> >
> > +               if (filter_options.choice)
> > +                       strvec_pushf(&args, "--filter=%s",
> > +                                    expand_list_objects_filter_spec(&filter_options));
> > +
> >                 if (option_single_branch >= 0)
> >                         strvec_push(&args, option_single_branch ?
> >                                                "--single-branch" :
> > diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
> > index c5d3fc3817..11552970f2 100644
> > --- a/builtin/submodule--helper.c
> > +++ b/builtin/submodule--helper.c
> > @@ -20,6 +20,7 @@
> >  #include "diff.h"
> >  #include "object-store.h"
> >  #include "advice.h"
> > +#include "list-objects-filter-options.h"
> >
> >  #define OPT_QUIET (1 << 0)
> >  #define OPT_CACHED (1 << 1)
> > @@ -1630,6 +1631,7 @@ struct module_clone_data {
> >         const char *name;
> >         const char *url;
> >         const char *depth;
> > +       struct list_objects_filter_options *filter_options;
> >         struct string_list reference;
> >         unsigned int quiet: 1;
> >         unsigned int progress: 1;
> > @@ -1796,6 +1798,10 @@ static int clone_submodule(struct module_clone_data *clone_data)
> >                         strvec_push(&cp.args, "--dissociate");
> >                 if (sm_gitdir && *sm_gitdir)
> >                         strvec_pushl(&cp.args, "--separate-git-dir", sm_gitdir, NULL);
> > +               if (clone_data->filter_options && clone_data->filter_options->choice)
> > +                       strvec_pushf(&cp.args, "--filter=%s",
> > +                                    expand_list_objects_filter_spec(
> > +                                            clone_data->filter_options));
> >                 if (clone_data->single_branch >= 0)
> >                         strvec_push(&cp.args, clone_data->single_branch ?
> >                                     "--single-branch" :
> > @@ -1852,6 +1858,7 @@ static int module_clone(int argc, const char **argv, const char *prefix)
> >  {
> >         int dissociate = 0, quiet = 0, progress = 0, require_init = 0;
> >         struct module_clone_data clone_data = MODULE_CLONE_DATA_INIT;
> > +       struct list_objects_filter_options filter_options;
> >
> >         struct option module_clone_options[] = {
> >                 OPT_STRING(0, "prefix", &clone_data.prefix,
> > @@ -1881,17 +1888,19 @@ static int module_clone(int argc, const char **argv, const char *prefix)
> >                            N_("disallow cloning into non-empty directory")),
> >                 OPT_BOOL(0, "single-branch", &clone_data.single_branch,
> >                          N_("clone only one branch, HEAD or --branch")),
> > +               OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
> >                 OPT_END()
> >         };
> >
> >         const char *const git_submodule_helper_usage[] = {
> >                 N_("git submodule--helper clone [--prefix=<path>] [--quiet] "
> >                    "[--reference <repository>] [--name <name>] [--depth <depth>] "
> > -                  "[--single-branch] "
> > +                  "[--single-branch] [--filter <filter-spec>]"
> >                    "--url <url> --path <path>"),
> >                 NULL
> >         };
> >
> > +       memset(&filter_options, 0, sizeof(filter_options));
> >         argc = parse_options(argc, argv, prefix, module_clone_options,
> >                              git_submodule_helper_usage, 0);
> >
> > @@ -1899,12 +1908,14 @@ static int module_clone(int argc, const char **argv, const char *prefix)
> >         clone_data.quiet = !!quiet;
> >         clone_data.progress = !!progress;
> >         clone_data.require_init = !!require_init;
> > +       clone_data.filter_options = &filter_options;
> >
> >         if (argc || !clone_data.url || !clone_data.path || !*(clone_data.path))
> >                 usage_with_options(git_submodule_helper_usage,
> >                                    module_clone_options);
> >
> >         clone_submodule(&clone_data);
> > +       list_objects_filter_release(&filter_options);
> >         return 0;
> >  }
> >
> > @@ -1994,6 +2005,7 @@ struct submodule_update_clone {
> >         const char *recursive_prefix;
> >         const char *prefix;
> >         int single_branch;
> > +       struct list_objects_filter_options *filter_options;
> >
> >         /* to be consumed by git-submodule.sh */
> >         struct update_clone_data *update_clone;
> > @@ -2154,6 +2166,9 @@ static int prepare_to_clone_next_submodule(const struct cache_entry *ce,
> >                 strvec_pushl(&child->args, "--prefix", suc->prefix, NULL);
> >         if (suc->recommend_shallow && sub->recommend_shallow == 1)
> >                 strvec_push(&child->args, "--depth=1");
> > +       if (suc->filter_options && suc->filter_options->choice)
> > +               strvec_pushf(&child->args, "--filter=%s",
> > +                            expand_list_objects_filter_spec(suc->filter_options));
> >         if (suc->require_init)
> >                 strvec_push(&child->args, "--require-init");
> >         strvec_pushl(&child->args, "--path", sub->path, NULL);
> > @@ -2498,6 +2513,8 @@ static int update_clone(int argc, const char **argv, const char *prefix)
> >         const char *update = NULL;
> >         struct pathspec pathspec;
> >         struct submodule_update_clone suc = SUBMODULE_UPDATE_CLONE_INIT;
> > +       struct list_objects_filter_options filter_options;
> > +       int ret;
> >
> >         struct option module_update_clone_options[] = {
> >                 OPT_STRING(0, "prefix", &prefix,
> > @@ -2528,6 +2545,7 @@ static int update_clone(int argc, const char **argv, const char *prefix)
> >                            N_("disallow cloning into non-empty directory")),
> >                 OPT_BOOL(0, "single-branch", &suc.single_branch,
> >                          N_("clone only one branch, HEAD or --branch")),
> > +               OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
> >                 OPT_END()
> >         };
> >
> > @@ -2540,20 +2558,26 @@ static int update_clone(int argc, const char **argv, const char *prefix)
> >         update_clone_config_from_gitmodules(&suc.max_jobs);
> >         git_config(git_update_clone_config, &suc.max_jobs);
> >
> > +       memset(&filter_options, 0, sizeof(filter_options));
> >         argc = parse_options(argc, argv, prefix, module_update_clone_options,
> >                              git_submodule_helper_usage, 0);
> > +       suc.filter_options = &filter_options;
> >
> >         if (update)
> >                 if (parse_submodule_update_strategy(update, &suc.update) < 0)
> >                         die(_("bad value for update parameter"));
> >
> > -       if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0)
> > +       if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0) {
> > +               list_objects_filter_release(&filter_options);
> >                 return 1;
> > +       }
> >
> >         if (pathspec.nr)
> >                 suc.warn_if_uninitialized = 1;
> >
> > -       return update_submodules(&suc);
> > +       ret = update_submodules(&suc);
> > +       list_objects_filter_release(&filter_options);
> > +       return ret;
> >  }
> >
> >  static int run_update_procedure(int argc, const char **argv, const char *prefix)
> > diff --git a/git-submodule.sh b/git-submodule.sh
> > index 652861aa66..926f6873d3 100755
> > --- a/git-submodule.sh
> > +++ b/git-submodule.sh
> > @@ -10,7 +10,7 @@ USAGE="[--quiet] [--cached]
> >     or: $dashless [--quiet] status [--cached] [--recursive] [--] [<path>...]
> >     or: $dashless [--quiet] init [--] [<path>...]
> >     or: $dashless [--quiet] deinit [-f|--force] (--all| [--] <path>...)
> > -   or: $dashless [--quiet] update [--init] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
> > +   or: $dashless [--quiet] update [--init [--filter=<filter-spec>]] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
> >     or: $dashless [--quiet] set-branch (--default|--branch <branch>) [--] <path>
> >     or: $dashless [--quiet] set-url [--] <path> <newurl>
> >     or: $dashless [--quiet] summary [--cached|--files] [--summary-limit <n>] [commit] [--] [<path>...]
> > @@ -49,6 +49,7 @@ dissociate=
> >  single_branch=
> >  jobs=
> >  recommend_shallow=
> > +filter=
> >
> >  die_if_unmatched ()
> >  {
> > @@ -347,6 +348,14 @@ cmd_update()
> >                 --no-single-branch)
> >                         single_branch="--no-single-branch"
> >                         ;;
> > +               --filter)
> > +                       case "$2" in '') usage ;; esac
> > +                       filter="--filter=$2"
> > +                       shift
> > +                       ;;
> > +               --filter=*)
> > +                       filter=$1
> 
> Shouldn't this be
>      filter="$1"
> ?  I think it'd work currently without the quotes, but seeing the
> discussion of special characters for --filter=combine in
> git-rev-list(1) make me worry that this could become unsafe in the
> future.

Yes, thanks for the catch. Will fix in V2.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] clone, submodule: pass partial clone filters to submodules
  2022-01-21  3:32 [PATCH] clone, submodule: pass partial clone filters to submodules Josh Steadmon
  2022-01-22  1:49 ` Junio C Hamano
  2022-01-25 21:08 ` Elijah Newren
@ 2022-02-05  0:40 ` Josh Steadmon
  2022-02-05  0:54   ` Josh Steadmon
  2022-02-05  5:00 ` [PATCH v3] " Josh Steadmon
  3 siblings, 1 reply; 14+ messages in thread
From: Josh Steadmon @ 2022-02-05  0:40 UTC (permalink / raw)
  To: git; +Cc: gitster, newren

When cloning a repo with a --filter and with --recurse-submodules
enabled, the partial clone filter only applies to the top-level repo.
This can lead to unexpected bandwidth and disk usage for projects which
include large submodules. For example, a user might wish to make a
partial clone of Gerrit and would run:
`git clone --recurse-submodules --filter=blob:5k https://gerrit.googlesource.com/gerrit`.
However, only the superproject would be a partial clone; all the
submodules would have all blobs downloaded regardless of their size.
With this change, the same filter can also be applied to submodules,
meaning the expected bandwidth and disk savings apply consistently.

To avoid changing default behavior, add a new clone flag,
`--also-filter-submodules`. When this is set along with `--filter` and
`--recurse-submodules`, the filter spec is passed along to git-submodule
and git-submodule--helper, such that submodule clones also have the
filter applied.

This applies the same filter to the superproject and all submodules.
Users who need to customize the filter per-submodule would need to clone
with `--no-recurse-submodules` and then manually initialize each
submodule with the proper filter.

Applying filters to submodules should be safe thanks to Jonathan Tan's
recent work [1, 2, 3] eliminating the use of alternates as a method of
accessing submodule objects, so any submodule object access now triggers
a lazy fetch from the submodule's promisor remote if the accessed object
is missing. This patch is a reworked version of [4], which was created
prior to Jonathan Tan's work.

[1]: 8721e2e (Merge branch 'jt/partial-clone-submodule-1', 2021-07-16)
[2]: 11e5d0a (Merge branch 'jt/grep-wo-submodule-odb-as-alternate',
	2021-09-20)
[3]: 162a13b (Merge branch 'jt/no-abuse-alternate-odb-for-submodules',
	2021-10-25)
[4]: https://lore.kernel.org/git/52bf9d45b8e2b72ff32aa773f2415bf7b2b86da2.1563322192.git.steadmon@google.com/

Signed-off-by: Josh Steadmon <steadmon@google.com>
---
Changes since V2:
* Fixed a shell-quoting issue in git-submodule.sh
* Added a new clone flag (--also-filter-submodules) and config
  (clone.filterSubmodules) so that the default behavior remains the
  same, but users who want this option generally can set-and-forget it
  in their global config.

Range-diff against v1:
1:  50ebf7bd39 ! 1:  8678e721c2 clone, submodule: pass partial clone filters to submodules
    @@ Commit message
         clone, submodule: pass partial clone filters to submodules
     
         When cloning a repo with a --filter and with --recurse-submodules
    -    enabled, the partial clone filter only applies to
    -    the top-level repo. This can lead to unexpected bandwidth and disk
    -    usage for projects which include large submodules. For example, a user
    -    might wish to make a partial clone of Gerrit and would run:
    -    `git clone --recurse-submodules --filter=blob:5k
    -    https://gerrit.googlesource.com/gerrit`. However, only the superproject
    -    would be a partial clone; all the submodules would have all blobs
    -    downloaded regardless of their size. With this change, the same filter
    -    applies to submodules, meaning the expected bandwidth and disk savings
    -    apply consistently.
    +    enabled, the partial clone filter only applies to the top-level repo.
    +    This can lead to unexpected bandwidth and disk usage for projects which
    +    include large submodules. For example, a user might wish to make a
    +    partial clone of Gerrit and would run:
    +    `git clone --recurse-submodules --filter=blob:5k https://gerrit.googlesource.com/gerrit`.
    +    However, only the superproject would be a partial clone; all the
    +    submodules would have all blobs downloaded regardless of their size.
    +    With this change, the same filter can also be applied to submodules,
    +    meaning the expected bandwidth and disk savings apply consistently.
     
    -    Plumb the --filter argument from git-clone through git-submodule and
    -    git-submodule--helper, such that submodule clones also have the filter
    -    applied.
    +    To avoid changing default behavior, add a new clone flag,
    +    `--also-filter-submodules`. When this is set along with `--filter` and
    +    `--recurse-submodules`, the filter spec is passed along to git-submodule
    +    and git-submodule--helper, such that submodule clones also have the
    +    filter applied.
     
         This applies the same filter to the superproject and all submodules.
    -    Users who prefer the current behavior (i.e., a filter only on the
    -    superproject) would need to clone with `--no-recurse-submodules` and
    -    then manually initialize each submodule.
    +    Users who need to customize the filter per-submodule would need to clone
    +    with `--no-recurse-submodules` and then manually initialize each
    +    submodule with the proper filter.
     
         Applying filters to submodules should be safe thanks to Jonathan Tan's
         recent work [1, 2, 3] eliminating the use of alternates as a method of
    @@ Commit message
     
     
    + ## Documentation/config/clone.txt ##
    +@@ Documentation/config/clone.txt: clone.defaultRemoteName::
    + clone.rejectShallow::
    + 	Reject to clone a repository if it is a shallow one, can be overridden by
    + 	passing option `--reject-shallow` in command line. See linkgit:git-clone[1]
    ++
    ++clone.filterSubmodules::
    ++	If a partial clone filter is provided (see `--filter` in
    ++	linkgit:git-rev-list[1]) and `--recurse-submodules` is used, also apply
    ++	the filter to submodules.
    +
    + ## Documentation/git-clone.txt ##
    +@@ Documentation/git-clone.txt: SYNOPSIS
    + 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
    + 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
    + 	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--[no-]reject-shallow]
    +-	  [--filter=<filter>] [--] <repository>
    ++	  [--filter=<filter> [--also-filter-submodules]] [--] <repository>
    + 	  [<directory>]
    + 
    + DESCRIPTION
    +@@ Documentation/git-clone.txt: objects from the source repository into a pack in the cloned repository.
    + 	at least `<size>`. For more details on filter specifications, see
    + 	the `--filter` option in linkgit:git-rev-list[1].
    + 
    ++--also-filter-submodules::
    ++	Also apply the partial clone filter to any submodules in the repository.
    ++	Requires `--filter` and `--recurse-submodules`. This can be turned on by
    ++	default by setting the `clone.filterSubmodules` config option.
    ++
    + --mirror::
    + 	Set up a mirror of the source repository.  This implies `--bare`.
    + 	Compared to `--bare`, `--mirror` not only maps local branches of the
    +
    + ## Documentation/git-submodule.txt ##
    +@@ Documentation/git-submodule.txt: If you really want to remove a submodule from the repository and commit
    + that use linkgit:git-rm[1] instead. See linkgit:gitsubmodules[7] for removal
    + options.
    + 
    +-update [--init] [--remote] [-N|--no-fetch] [--[no-]recommend-shallow] [-f|--force] [--checkout|--rebase|--merge] [--reference <repository>] [--depth <depth>] [--recursive] [--jobs <n>] [--[no-]single-branch] [--] [<path>...]::
    ++update [--init] [--remote] [-N|--no-fetch] [--[no-]recommend-shallow] [-f|--force] [--checkout|--rebase|--merge] [--reference <repository>] [--depth <depth>] [--recursive] [--jobs <n>] [--[no-]single-branch] [--filter <filter spec>] [--] [<path>...]::
    + +
    + --
    + Update the registered submodules to match what the superproject
    +@@ Documentation/git-submodule.txt: submodule with the `--init` option.
    + 
    + If `--recursive` is specified, this command will recurse into the
    + registered submodules, and update any nested submodules within.
    ++
    ++If `--filter <filter spec>` is specified, the given partial clone filter will be
    ++applied to the submodule. See linkgit:git-rev-list[1] for details on filter
    ++specifications.
    + --
    + set-branch (-b|--branch) <branch> [--] <path>::
    + set-branch (-d|--default) [--] <path>::
    +
      ## builtin/clone.c ##
    +@@ builtin/clone.c: static int option_dissociate;
    + static int max_jobs = -1;
    + static struct string_list option_recurse_submodules = STRING_LIST_INIT_NODUP;
    + static struct list_objects_filter_options filter_options;
    ++static int option_filter_submodules = -1;    /* unspecified */
    ++static int config_filter_submodules = -1;    /* unspecified */
    + static struct string_list server_options = STRING_LIST_INIT_NODUP;
    + static int option_remote_submodules;
    + 
    +@@ builtin/clone.c: static struct option builtin_clone_options[] = {
    + 	OPT_SET_INT('6', "ipv6", &family, N_("use IPv6 addresses only"),
    + 			TRANSPORT_FAMILY_IPV6),
    + 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
    ++	OPT_BOOL(0, "also-filter-submodules", &option_filter_submodules,
    ++		    N_("apply partial clone filters to submodules")),
    + 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
    + 		    N_("any cloned submodules will use their remote-tracking branch")),
    + 	OPT_BOOL(0, "sparse", &option_sparse_checkout,
    +@@ builtin/clone.c: static int git_sparse_checkout_init(const char *repo)
    + 	return result;
    + }
    + 
    +-static int checkout(int submodule_progress)
    ++static int checkout(int submodule_progress, int filter_submodules)
    + {
    + 	struct object_id oid;
    + 	char *head;
     @@ builtin/clone.c: static int checkout(int submodule_progress)
      			strvec_push(&args, "--no-fetch");
      		}
      
    -+		if (filter_options.choice)
    ++		if (filter_submodules && filter_options.choice)
     +			strvec_pushf(&args, "--filter=%s",
     +				     expand_list_objects_filter_spec(&filter_options));
     +
      		if (option_single_branch >= 0)
      			strvec_push(&args, option_single_branch ?
      					       "--single-branch" :
    +@@ builtin/clone.c: static int git_clone_config(const char *k, const char *v, void *cb)
    + 	}
    + 	if (!strcmp(k, "clone.rejectshallow"))
    + 		config_reject_shallow = git_config_bool(k, v);
    ++	if (!strcmp(k, "clone.filtersubmodules"))
    ++		config_filter_submodules = git_config_bool(k, v);
    + 
    + 	return git_default_config(k, v, cb);
    + }
    +@@ builtin/clone.c: int cmd_clone(int argc, const char **argv, const char *prefix)
    + 	struct remote *remote;
    + 	int err = 0, complete_refs_before_fetch = 1;
    + 	int submodule_progress;
    ++	int filter_submodules = 0;
    + 
    + 	struct transport_ls_refs_options transport_ls_refs_options =
    + 		TRANSPORT_LS_REFS_OPTIONS_INIT;
    +@@ builtin/clone.c: int cmd_clone(int argc, const char **argv, const char *prefix)
    + 	if (option_reject_shallow != -1)
    + 		reject_shallow = option_reject_shallow;
    + 
    ++	/*
    ++	 * If option_filter_submodules is specified from CLI option,
    ++	 * ignore config_filter_submodules from git_clone_config.
    ++	 */
    ++	if (config_filter_submodules != -1)
    ++		filter_submodules = config_filter_submodules;
    ++	if (option_filter_submodules != -1)
    ++		filter_submodules = option_filter_submodules;
    ++
    ++	/*
    ++	 * Exit if the user seems to be doing something silly with submodule
    ++	 * filter flags (but not with filter configs, as those should be
    ++	 * set-and-forget).
    ++	 */
    ++	if (option_filter_submodules > 0 && !filter_options.choice)
    ++		die(_("the option '%s' requires '%s'"),
    ++		    "--also-filter-submodules", "--filter");
    ++	if (option_filter_submodules > 0 && !option_recurse_submodules.nr)
    ++		die(_("the option '%s' requires '%s'"),
    ++		    "--also-filter-submodules", "--recurse-submodules");
    ++
    + 	/*
    + 	 * apply the remote name provided by --origin only after this second
    + 	 * call to git_config, to ensure it overrides all config-based values.
    +@@ builtin/clone.c: int cmd_clone(int argc, const char **argv, const char *prefix)
    + 	}
    + 
    + 	junk_mode = JUNK_LEAVE_REPO;
    +-	err = checkout(submodule_progress);
    ++	err = checkout(submodule_progress, filter_submodules);
    + 
    + 	free(remote_name);
    + 	strbuf_release(&reflog_msg);
     
      ## builtin/submodule--helper.c ##
     @@
    @@ git-submodule.sh: cmd_update()
     +			shift
     +			;;
     +		--filter=*)
    -+			filter=$1
    ++			filter="$1"
     +			;;
      		--)
      			shift
    @@ t/t5617-clone-submodules-remote.sh: test_expect_success 'clone with --single-bra
     +# do basic partial clone from "srv.bare"
     +# confirm partial clone was registered in the local config for super and sub.
     +test_expect_success 'clone with --filter' '
    -+	git clone --recurse-submodules --filter blob:none "file://$pwd/srv.bare" super_clone &&
    ++	git clone --recurse-submodules \
    ++		--filter blob:none --also-filter-submodules \
    ++		"file://$pwd/srv.bare" super_clone &&
     +	test_cmp_config -C super_clone 1 core.repositoryformatversion &&
     +	test_cmp_config -C super_clone blob:none remote.origin.partialclonefilter &&
     +	test_cmp_config -C super_clone/sub 1 core.repositoryformatversion &&
     +	test_cmp_config -C super_clone/sub blob:none remote.origin.partialclonefilter
     +'
    ++
    ++# check that clone.filterSubmodules works (--also-filter-submodules can be
    ++# omitted)
    ++test_expect_success 'filters applied with clone.filterSubmodules' '
    ++	test_config_global clone.filterSubmodules true &&
    ++	git clone --recurse-submodules --filter blob:none \
    ++		"file://$pwd/srv.bare" super_clone2 &&
    ++	test_cmp_config -C super_clone2 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone2 blob:none remote.origin.partialclonefilter &&
    ++	test_cmp_config -C super_clone2/sub 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone2/sub blob:none remote.origin.partialclonefilter
    ++'
    ++
    ++test_expect_success '--no-also-filter-submodules overrides clone.filterSubmodules=true' '
    ++	test_config_global clone.filterSubmodules true &&
    ++	git clone --recurse-submodules --filter blob:none \
    ++		--no-also-filter-submodules \
    ++		"file://$pwd/srv.bare" super_clone3 &&
    ++	test_cmp_config -C super_clone3 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone3 blob:none remote.origin.partialclonefilter &&
    ++	test_cmp_config -C super_clone3/sub 0 core.repositoryformatversion
    ++'
     +
      test_done
     
    @@ t/t7814-grep-recurse-submodules.sh: test_expect_failure 'grep saves textconv cac
     +	git init super/sub &&
     +	(
     +		cd super &&
    -+		test_commit --no-tag "Add file in superproject" super-file "Some content for super-file" &&
    -+		test_commit -C sub --no-tag "Add file in submodule" sub-file "Some content for sub-file" &&
    ++		test_commit --no-tag "Add file in superproject" \
    ++			super-file "Some content for super-file" &&
    ++		test_commit -C sub --no-tag "Add file in submodule" \
    ++			sub-file "Some content for sub-file" &&
     +		git submodule add ./sub &&
     +		git commit -m "Add other as submodule sub" &&
     +		test_tick &&
    -+		test_commit -C sub --no-tag --append "Update file in submodule" sub-file "Some more content for sub-file" &&
    ++		test_commit -C sub --no-tag --append "Update file in submodule" \
    ++			sub-file "Some more content for sub-file" &&
     +		git add sub &&
     +		git commit -m "Update submodule" &&
     +		test_tick &&
    @@ t/t7814-grep-recurse-submodules.sh: test_expect_failure 'grep saves textconv cac
     +		git -C sub config --local uploadpack.allowanysha1inwant 1
     +	) &&
     +	# Clone the superproject & submodule, then make sure we can lazy-fetch submodule objects.
    -+	git clone --filter=blob:none --recurse-submodules "file://$(pwd)/super" partial &&
    ++	git clone --filter=blob:none --also-filter-submodules \
    ++		--recurse-submodules "file://$(pwd)/super" partial &&
     +	(
     +		cd partial &&
     +		cat >expect <<-\EOF &&
    @@ t/t7814-grep-recurse-submodules.sh: test_expect_failure 'grep saves textconv cac
     +		HEAD^:super-file:Some content for super-file
     +		EOF
     +
    -+		GIT_TRACE2_EVENT="$(pwd)/trace2.log" git grep -e content --recurse-submodules HEAD^ >actual &&
    ++		GIT_TRACE2_EVENT="$(pwd)/trace2.log" git grep -e content \
    ++			--recurse-submodules HEAD^ >actual &&
     +		test_cmp expect actual &&
     +		# Verify that we actually fetched data from the promisor remote:
     +		grep \"category\":\"promisor\",\"key\":\"fetch_count\",\"value\":\"1\" trace2.log >/dev/null

 Documentation/config/clone.txt     |  5 ++++
 Documentation/git-clone.txt        |  7 ++++-
 Documentation/git-submodule.txt    |  6 ++++-
 builtin/clone.c                    | 36 ++++++++++++++++++++++++--
 builtin/submodule--helper.c        | 30 +++++++++++++++++++---
 git-submodule.sh                   | 17 ++++++++++++-
 t/t5617-clone-submodules-remote.sh | 41 ++++++++++++++++++++++++++++++
 t/t7814-grep-recurse-submodules.sh | 41 ++++++++++++++++++++++++++++++
 8 files changed, 175 insertions(+), 8 deletions(-)

diff --git a/Documentation/config/clone.txt b/Documentation/config/clone.txt
index 7bcfbd18a5..26f4fb137a 100644
--- a/Documentation/config/clone.txt
+++ b/Documentation/config/clone.txt
@@ -6,3 +6,8 @@ clone.defaultRemoteName::
 clone.rejectShallow::
 	Reject to clone a repository if it is a shallow one, can be overridden by
 	passing option `--reject-shallow` in command line. See linkgit:git-clone[1]
+
+clone.filterSubmodules::
+	If a partial clone filter is provided (see `--filter` in
+	linkgit:git-rev-list[1]) and `--recurse-submodules` is used, also apply
+	the filter to submodules.
diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
index 984d194934..632bd1348e 100644
--- a/Documentation/git-clone.txt
+++ b/Documentation/git-clone.txt
@@ -16,7 +16,7 @@ SYNOPSIS
 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
 	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--[no-]reject-shallow]
-	  [--filter=<filter>] [--] <repository>
+	  [--filter=<filter> [--also-filter-submodules]] [--] <repository>
 	  [<directory>]
 
 DESCRIPTION
@@ -182,6 +182,11 @@ objects from the source repository into a pack in the cloned repository.
 	at least `<size>`. For more details on filter specifications, see
 	the `--filter` option in linkgit:git-rev-list[1].
 
+--also-filter-submodules::
+	Also apply the partial clone filter to any submodules in the repository.
+	Requires `--filter` and `--recurse-submodules`. This can be turned on by
+	default by setting the `clone.filterSubmodules` config option.
+
 --mirror::
 	Set up a mirror of the source repository.  This implies `--bare`.
 	Compared to `--bare`, `--mirror` not only maps local branches of the
diff --git a/Documentation/git-submodule.txt b/Documentation/git-submodule.txt
index 7e5f995f77..4d3ab6b9f9 100644
--- a/Documentation/git-submodule.txt
+++ b/Documentation/git-submodule.txt
@@ -133,7 +133,7 @@ If you really want to remove a submodule from the repository and commit
 that use linkgit:git-rm[1] instead. See linkgit:gitsubmodules[7] for removal
 options.
 
-update [--init] [--remote] [-N|--no-fetch] [--[no-]recommend-shallow] [-f|--force] [--checkout|--rebase|--merge] [--reference <repository>] [--depth <depth>] [--recursive] [--jobs <n>] [--[no-]single-branch] [--] [<path>...]::
+update [--init] [--remote] [-N|--no-fetch] [--[no-]recommend-shallow] [-f|--force] [--checkout|--rebase|--merge] [--reference <repository>] [--depth <depth>] [--recursive] [--jobs <n>] [--[no-]single-branch] [--filter <filter spec>] [--] [<path>...]::
 +
 --
 Update the registered submodules to match what the superproject
@@ -177,6 +177,10 @@ submodule with the `--init` option.
 
 If `--recursive` is specified, this command will recurse into the
 registered submodules, and update any nested submodules within.
+
+If `--filter <filter spec>` is specified, the given partial clone filter will be
+applied to the submodule. See linkgit:git-rev-list[1] for details on filter
+specifications.
 --
 set-branch (-b|--branch) <branch> [--] <path>::
 set-branch (-d|--default) [--] <path>::
diff --git a/builtin/clone.c b/builtin/clone.c
index 727e16e0ae..fb605e4c8d 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -71,6 +71,8 @@ static int option_dissociate;
 static int max_jobs = -1;
 static struct string_list option_recurse_submodules = STRING_LIST_INIT_NODUP;
 static struct list_objects_filter_options filter_options;
+static int option_filter_submodules = -1;    /* unspecified */
+static int config_filter_submodules = -1;    /* unspecified */
 static struct string_list server_options = STRING_LIST_INIT_NODUP;
 static int option_remote_submodules;
 
@@ -150,6 +152,8 @@ static struct option builtin_clone_options[] = {
 	OPT_SET_INT('6', "ipv6", &family, N_("use IPv6 addresses only"),
 			TRANSPORT_FAMILY_IPV6),
 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
+	OPT_BOOL(0, "also-filter-submodules", &option_filter_submodules,
+		    N_("apply partial clone filters to submodules")),
 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
 		    N_("any cloned submodules will use their remote-tracking branch")),
 	OPT_BOOL(0, "sparse", &option_sparse_checkout,
@@ -650,7 +654,7 @@ static int git_sparse_checkout_init(const char *repo)
 	return result;
 }
 
-static int checkout(int submodule_progress)
+static int checkout(int submodule_progress, int filter_submodules)
 {
 	struct object_id oid;
 	char *head;
@@ -729,6 +733,10 @@ static int checkout(int submodule_progress)
 			strvec_push(&args, "--no-fetch");
 		}
 
+		if (filter_submodules && filter_options.choice)
+			strvec_pushf(&args, "--filter=%s",
+				     expand_list_objects_filter_spec(&filter_options));
+
 		if (option_single_branch >= 0)
 			strvec_push(&args, option_single_branch ?
 					       "--single-branch" :
@@ -749,6 +757,8 @@ static int git_clone_config(const char *k, const char *v, void *cb)
 	}
 	if (!strcmp(k, "clone.rejectshallow"))
 		config_reject_shallow = git_config_bool(k, v);
+	if (!strcmp(k, "clone.filtersubmodules"))
+		config_filter_submodules = git_config_bool(k, v);
 
 	return git_default_config(k, v, cb);
 }
@@ -871,6 +881,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	struct remote *remote;
 	int err = 0, complete_refs_before_fetch = 1;
 	int submodule_progress;
+	int filter_submodules = 0;
 
 	struct transport_ls_refs_options transport_ls_refs_options =
 		TRANSPORT_LS_REFS_OPTIONS_INIT;
@@ -1066,6 +1077,27 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	if (option_reject_shallow != -1)
 		reject_shallow = option_reject_shallow;
 
+	/*
+	 * If option_filter_submodules is specified from CLI option,
+	 * ignore config_filter_submodules from git_clone_config.
+	 */
+	if (config_filter_submodules != -1)
+		filter_submodules = config_filter_submodules;
+	if (option_filter_submodules != -1)
+		filter_submodules = option_filter_submodules;
+
+	/*
+	 * Exit if the user seems to be doing something silly with submodule
+	 * filter flags (but not with filter configs, as those should be
+	 * set-and-forget).
+	 */
+	if (option_filter_submodules > 0 && !filter_options.choice)
+		die(_("the option '%s' requires '%s'"),
+		    "--also-filter-submodules", "--filter");
+	if (option_filter_submodules > 0 && !option_recurse_submodules.nr)
+		die(_("the option '%s' requires '%s'"),
+		    "--also-filter-submodules", "--recurse-submodules");
+
 	/*
 	 * apply the remote name provided by --origin only after this second
 	 * call to git_config, to ensure it overrides all config-based values.
@@ -1299,7 +1331,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	}
 
 	junk_mode = JUNK_LEAVE_REPO;
-	err = checkout(submodule_progress);
+	err = checkout(submodule_progress, filter_submodules);
 
 	free(remote_name);
 	strbuf_release(&reflog_msg);
diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
index c5d3fc3817..11552970f2 100644
--- a/builtin/submodule--helper.c
+++ b/builtin/submodule--helper.c
@@ -20,6 +20,7 @@
 #include "diff.h"
 #include "object-store.h"
 #include "advice.h"
+#include "list-objects-filter-options.h"
 
 #define OPT_QUIET (1 << 0)
 #define OPT_CACHED (1 << 1)
@@ -1630,6 +1631,7 @@ struct module_clone_data {
 	const char *name;
 	const char *url;
 	const char *depth;
+	struct list_objects_filter_options *filter_options;
 	struct string_list reference;
 	unsigned int quiet: 1;
 	unsigned int progress: 1;
@@ -1796,6 +1798,10 @@ static int clone_submodule(struct module_clone_data *clone_data)
 			strvec_push(&cp.args, "--dissociate");
 		if (sm_gitdir && *sm_gitdir)
 			strvec_pushl(&cp.args, "--separate-git-dir", sm_gitdir, NULL);
+		if (clone_data->filter_options && clone_data->filter_options->choice)
+			strvec_pushf(&cp.args, "--filter=%s",
+				     expand_list_objects_filter_spec(
+					     clone_data->filter_options));
 		if (clone_data->single_branch >= 0)
 			strvec_push(&cp.args, clone_data->single_branch ?
 				    "--single-branch" :
@@ -1852,6 +1858,7 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 {
 	int dissociate = 0, quiet = 0, progress = 0, require_init = 0;
 	struct module_clone_data clone_data = MODULE_CLONE_DATA_INIT;
+	struct list_objects_filter_options filter_options;
 
 	struct option module_clone_options[] = {
 		OPT_STRING(0, "prefix", &clone_data.prefix,
@@ -1881,17 +1888,19 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 			   N_("disallow cloning into non-empty directory")),
 		OPT_BOOL(0, "single-branch", &clone_data.single_branch,
 			 N_("clone only one branch, HEAD or --branch")),
+		OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 		OPT_END()
 	};
 
 	const char *const git_submodule_helper_usage[] = {
 		N_("git submodule--helper clone [--prefix=<path>] [--quiet] "
 		   "[--reference <repository>] [--name <name>] [--depth <depth>] "
-		   "[--single-branch] "
+		   "[--single-branch] [--filter <filter-spec>]"
 		   "--url <url> --path <path>"),
 		NULL
 	};
 
+	memset(&filter_options, 0, sizeof(filter_options));
 	argc = parse_options(argc, argv, prefix, module_clone_options,
 			     git_submodule_helper_usage, 0);
 
@@ -1899,12 +1908,14 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 	clone_data.quiet = !!quiet;
 	clone_data.progress = !!progress;
 	clone_data.require_init = !!require_init;
+	clone_data.filter_options = &filter_options;
 
 	if (argc || !clone_data.url || !clone_data.path || !*(clone_data.path))
 		usage_with_options(git_submodule_helper_usage,
 				   module_clone_options);
 
 	clone_submodule(&clone_data);
+	list_objects_filter_release(&filter_options);
 	return 0;
 }
 
@@ -1994,6 +2005,7 @@ struct submodule_update_clone {
 	const char *recursive_prefix;
 	const char *prefix;
 	int single_branch;
+	struct list_objects_filter_options *filter_options;
 
 	/* to be consumed by git-submodule.sh */
 	struct update_clone_data *update_clone;
@@ -2154,6 +2166,9 @@ static int prepare_to_clone_next_submodule(const struct cache_entry *ce,
 		strvec_pushl(&child->args, "--prefix", suc->prefix, NULL);
 	if (suc->recommend_shallow && sub->recommend_shallow == 1)
 		strvec_push(&child->args, "--depth=1");
+	if (suc->filter_options && suc->filter_options->choice)
+		strvec_pushf(&child->args, "--filter=%s",
+			     expand_list_objects_filter_spec(suc->filter_options));
 	if (suc->require_init)
 		strvec_push(&child->args, "--require-init");
 	strvec_pushl(&child->args, "--path", sub->path, NULL);
@@ -2498,6 +2513,8 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 	const char *update = NULL;
 	struct pathspec pathspec;
 	struct submodule_update_clone suc = SUBMODULE_UPDATE_CLONE_INIT;
+	struct list_objects_filter_options filter_options;
+	int ret;
 
 	struct option module_update_clone_options[] = {
 		OPT_STRING(0, "prefix", &prefix,
@@ -2528,6 +2545,7 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 			   N_("disallow cloning into non-empty directory")),
 		OPT_BOOL(0, "single-branch", &suc.single_branch,
 			 N_("clone only one branch, HEAD or --branch")),
+		OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 		OPT_END()
 	};
 
@@ -2540,20 +2558,26 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 	update_clone_config_from_gitmodules(&suc.max_jobs);
 	git_config(git_update_clone_config, &suc.max_jobs);
 
+	memset(&filter_options, 0, sizeof(filter_options));
 	argc = parse_options(argc, argv, prefix, module_update_clone_options,
 			     git_submodule_helper_usage, 0);
+	suc.filter_options = &filter_options;
 
 	if (update)
 		if (parse_submodule_update_strategy(update, &suc.update) < 0)
 			die(_("bad value for update parameter"));
 
-	if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0)
+	if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0) {
+		list_objects_filter_release(&filter_options);
 		return 1;
+	}
 
 	if (pathspec.nr)
 		suc.warn_if_uninitialized = 1;
 
-	return update_submodules(&suc);
+	ret = update_submodules(&suc);
+	list_objects_filter_release(&filter_options);
+	return ret;
 }
 
 static int run_update_procedure(int argc, const char **argv, const char *prefix)
diff --git a/git-submodule.sh b/git-submodule.sh
index 652861aa66..87772ac891 100755
--- a/git-submodule.sh
+++ b/git-submodule.sh
@@ -10,7 +10,7 @@ USAGE="[--quiet] [--cached]
    or: $dashless [--quiet] status [--cached] [--recursive] [--] [<path>...]
    or: $dashless [--quiet] init [--] [<path>...]
    or: $dashless [--quiet] deinit [-f|--force] (--all| [--] <path>...)
-   or: $dashless [--quiet] update [--init] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
+   or: $dashless [--quiet] update [--init [--filter=<filter-spec>]] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
    or: $dashless [--quiet] set-branch (--default|--branch <branch>) [--] <path>
    or: $dashless [--quiet] set-url [--] <path> <newurl>
    or: $dashless [--quiet] summary [--cached|--files] [--summary-limit <n>] [commit] [--] [<path>...]
@@ -49,6 +49,7 @@ dissociate=
 single_branch=
 jobs=
 recommend_shallow=
+filter=
 
 die_if_unmatched ()
 {
@@ -347,6 +348,14 @@ cmd_update()
 		--no-single-branch)
 			single_branch="--no-single-branch"
 			;;
+		--filter)
+			case "$2" in '') usage ;; esac
+			filter="--filter=$2"
+			shift
+			;;
+		--filter=*)
+			filter="$1"
+			;;
 		--)
 			shift
 			break
@@ -361,6 +370,11 @@ cmd_update()
 		shift
 	done
 
+	if test -n "$filter" && test "$init" != "1"
+	then
+		usage
+	fi
+
 	if test -n "$init"
 	then
 		cmd_init "--" "$@" || return
@@ -379,6 +393,7 @@ cmd_update()
 		$single_branch \
 		$recommend_shallow \
 		$jobs \
+		$filter \
 		-- \
 		"$@" || echo "#unmatched" $?
 	} | {
diff --git a/t/t5617-clone-submodules-remote.sh b/t/t5617-clone-submodules-remote.sh
index e2dbb4eaba..f7208b0565 100755
--- a/t/t5617-clone-submodules-remote.sh
+++ b/t/t5617-clone-submodules-remote.sh
@@ -28,6 +28,13 @@ test_expect_success 'setup' '
 	)
 '
 
+# bare clone giving "srv.bare" for use as our server.
+test_expect_success 'setup bare clone for server' '
+	git clone --bare "file://$(pwd)/." srv.bare &&
+	git -C srv.bare config --local uploadpack.allowfilter 1 &&
+	git -C srv.bare config --local uploadpack.allowanysha1inwant 1
+'
+
 test_expect_success 'clone with --no-remote-submodules' '
 	test_when_finished "rm -rf super_clone" &&
 	git clone --recurse-submodules --no-remote-submodules "file://$pwd/." super_clone &&
@@ -65,4 +72,38 @@ test_expect_success 'clone with --single-branch' '
 	)
 '
 
+# do basic partial clone from "srv.bare"
+# confirm partial clone was registered in the local config for super and sub.
+test_expect_success 'clone with --filter' '
+	git clone --recurse-submodules \
+		--filter blob:none --also-filter-submodules \
+		"file://$pwd/srv.bare" super_clone &&
+	test_cmp_config -C super_clone 1 core.repositoryformatversion &&
+	test_cmp_config -C super_clone blob:none remote.origin.partialclonefilter &&
+	test_cmp_config -C super_clone/sub 1 core.repositoryformatversion &&
+	test_cmp_config -C super_clone/sub blob:none remote.origin.partialclonefilter
+'
+
+# check that clone.filterSubmodules works (--also-filter-submodules can be
+# omitted)
+test_expect_success 'filters applied with clone.filterSubmodules' '
+	test_config_global clone.filterSubmodules true &&
+	git clone --recurse-submodules --filter blob:none \
+		"file://$pwd/srv.bare" super_clone2 &&
+	test_cmp_config -C super_clone2 1 core.repositoryformatversion &&
+	test_cmp_config -C super_clone2 blob:none remote.origin.partialclonefilter &&
+	test_cmp_config -C super_clone2/sub 1 core.repositoryformatversion &&
+	test_cmp_config -C super_clone2/sub blob:none remote.origin.partialclonefilter
+'
+
+test_expect_success '--no-also-filter-submodules overrides clone.filterSubmodules=true' '
+	test_config_global clone.filterSubmodules true &&
+	git clone --recurse-submodules --filter blob:none \
+		--no-also-filter-submodules \
+		"file://$pwd/srv.bare" super_clone3 &&
+	test_cmp_config -C super_clone3 1 core.repositoryformatversion &&
+	test_cmp_config -C super_clone3 blob:none remote.origin.partialclonefilter &&
+	test_cmp_config -C super_clone3/sub 0 core.repositoryformatversion
+'
+
 test_done
diff --git a/t/t7814-grep-recurse-submodules.sh b/t/t7814-grep-recurse-submodules.sh
index 058e5d0c96..48ab7a05c4 100755
--- a/t/t7814-grep-recurse-submodules.sh
+++ b/t/t7814-grep-recurse-submodules.sh
@@ -544,4 +544,45 @@ test_expect_failure 'grep saves textconv cache in the appropriate repository' '
 	test_path_is_file "$sub_textconv_cache"
 '
 
+test_expect_success 'grep partially-cloned submodule' '
+	# Set up clean superproject and submodule for partial cloning.
+	git init super &&
+	git init super/sub &&
+	(
+		cd super &&
+		test_commit --no-tag "Add file in superproject" \
+			super-file "Some content for super-file" &&
+		test_commit -C sub --no-tag "Add file in submodule" \
+			sub-file "Some content for sub-file" &&
+		git submodule add ./sub &&
+		git commit -m "Add other as submodule sub" &&
+		test_tick &&
+		test_commit -C sub --no-tag --append "Update file in submodule" \
+			sub-file "Some more content for sub-file" &&
+		git add sub &&
+		git commit -m "Update submodule" &&
+		test_tick &&
+		git config --local uploadpack.allowfilter 1 &&
+		git config --local uploadpack.allowanysha1inwant 1 &&
+		git -C sub config --local uploadpack.allowfilter 1 &&
+		git -C sub config --local uploadpack.allowanysha1inwant 1
+	) &&
+	# Clone the superproject & submodule, then make sure we can lazy-fetch submodule objects.
+	git clone --filter=blob:none --also-filter-submodules \
+		--recurse-submodules "file://$(pwd)/super" partial &&
+	(
+		cd partial &&
+		cat >expect <<-\EOF &&
+		HEAD^:sub/sub-file:Some content for sub-file
+		HEAD^:super-file:Some content for super-file
+		EOF
+
+		GIT_TRACE2_EVENT="$(pwd)/trace2.log" git grep -e content \
+			--recurse-submodules HEAD^ >actual &&
+		test_cmp expect actual &&
+		# Verify that we actually fetched data from the promisor remote:
+		grep \"category\":\"promisor\",\"key\":\"fetch_count\",\"value\":\"1\" trace2.log >/dev/null
+	)
+'
+
 test_done

base-commit: b56bd95bbc8f716cb8cbb5fdc18b9b0f00323c6a
-- 
2.35.0.263.gb82422642f-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] clone, submodule: pass partial clone filters to submodules
  2022-02-05  0:40 ` [PATCH v2] " Josh Steadmon
@ 2022-02-05  0:54   ` Josh Steadmon
  2022-02-05  1:00     ` Josh Steadmon
  0 siblings, 1 reply; 14+ messages in thread
From: Josh Steadmon @ 2022-02-05  0:54 UTC (permalink / raw)
  To: git; +Cc: gitster, newren

Hmm, t5617 fails with GIT_TEST_DEFAULT_HASH=sha256. Not sure why yet,
but I'll investigate and send a V3 with a fix.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] clone, submodule: pass partial clone filters to submodules
  2022-02-05  0:54   ` Josh Steadmon
@ 2022-02-05  1:00     ` Josh Steadmon
  0 siblings, 0 replies; 14+ messages in thread
From: Josh Steadmon @ 2022-02-05  1:00 UTC (permalink / raw)
  To: git, gitster, newren

On 2022.02.04 16:54, Josh Steadmon wrote:
> Hmm, t5617 fails with GIT_TEST_DEFAULT_HASH=sha256. Not sure why yet,
> but I'll investigate and send a V3 with a fix.

Ah, checking `core.repositoryformatversion` == 0 is not a good way to
verify that a repo is not a partial clone. I'll figure out an alternate
method for V3.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3] clone, submodule: pass partial clone filters to submodules
  2022-01-21  3:32 [PATCH] clone, submodule: pass partial clone filters to submodules Josh Steadmon
                   ` (2 preceding siblings ...)
  2022-02-05  0:40 ` [PATCH v2] " Josh Steadmon
@ 2022-02-05  5:00 ` Josh Steadmon
  2022-02-09 22:44   ` Jonathan Tan
  2022-02-19 17:30   ` [PATCH] t5617,t7814: remove unnecessary 'uploadpack.allowanysha1inwant' Philippe Blain
  3 siblings, 2 replies; 14+ messages in thread
From: Josh Steadmon @ 2022-02-05  5:00 UTC (permalink / raw)
  To: git; +Cc: gitster, newren

When cloning a repo with a --filter and with --recurse-submodules
enabled, the partial clone filter only applies to the top-level repo.
This can lead to unexpected bandwidth and disk usage for projects which
include large submodules. For example, a user might wish to make a
partial clone of Gerrit and would run:
`git clone --recurse-submodules --filter=blob:5k https://gerrit.googlesource.com/gerrit`.
However, only the superproject would be a partial clone; all the
submodules would have all blobs downloaded regardless of their size.
With this change, the same filter can also be applied to submodules,
meaning the expected bandwidth and disk savings apply consistently.

To avoid changing default behavior, add a new clone flag,
`--also-filter-submodules`. When this is set along with `--filter` and
`--recurse-submodules`, the filter spec is passed along to git-submodule
and git-submodule--helper, such that submodule clones also have the
filter applied.

This applies the same filter to the superproject and all submodules.
Users who need to customize the filter per-submodule would need to clone
with `--no-recurse-submodules` and then manually initialize each
submodule with the proper filter.

Applying filters to submodules should be safe thanks to Jonathan Tan's
recent work [1, 2, 3] eliminating the use of alternates as a method of
accessing submodule objects, so any submodule object access now triggers
a lazy fetch from the submodule's promisor remote if the accessed object
is missing. This patch is a reworked version of [4], which was created
prior to Jonathan Tan's work.

[1]: 8721e2e (Merge branch 'jt/partial-clone-submodule-1', 2021-07-16)
[2]: 11e5d0a (Merge branch 'jt/grep-wo-submodule-odb-as-alternate',
	2021-09-20)
[3]: 162a13b (Merge branch 'jt/no-abuse-alternate-odb-for-submodules',
	2021-10-25)
[4]: https://lore.kernel.org/git/52bf9d45b8e2b72ff32aa773f2415bf7b2b86da2.1563322192.git.steadmon@google.com/

Signed-off-by: Josh Steadmon <steadmon@google.com>
---
Changes since V2:
* Used a more reliable method for testing that a repository is a partial
  clone. Checking the remote.*.promisor field should work regardless of
  which hash algorithm (and therefore repository format) is used.

Changes since V1:
* Fixed a shell-quoting issue in git-submodule.sh
* Added a new clone flag (--also-filter-submodules) and config
  (clone.filterSubmodules) so that the default behavior remains the
  same, but users who want this option generally can set-and-forget it
  in their global config.

Range-diff against v2:
1:  8678e721c2 ! 1:  690d2316ad clone, submodule: pass partial clone filters to submodules
    @@ t/t5617-clone-submodules-remote.sh: test_expect_success 'clone with --single-bra
     +	git clone --recurse-submodules \
     +		--filter blob:none --also-filter-submodules \
     +		"file://$pwd/srv.bare" super_clone &&
    -+	test_cmp_config -C super_clone 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone true remote.origin.promisor &&
     +	test_cmp_config -C super_clone blob:none remote.origin.partialclonefilter &&
    -+	test_cmp_config -C super_clone/sub 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone/sub true remote.origin.promisor &&
     +	test_cmp_config -C super_clone/sub blob:none remote.origin.partialclonefilter
     +'
     +
    @@ t/t5617-clone-submodules-remote.sh: test_expect_success 'clone with --single-bra
     +	test_config_global clone.filterSubmodules true &&
     +	git clone --recurse-submodules --filter blob:none \
     +		"file://$pwd/srv.bare" super_clone2 &&
    -+	test_cmp_config -C super_clone2 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone2 true remote.origin.promisor &&
     +	test_cmp_config -C super_clone2 blob:none remote.origin.partialclonefilter &&
    -+	test_cmp_config -C super_clone2/sub 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone2/sub true remote.origin.promisor &&
     +	test_cmp_config -C super_clone2/sub blob:none remote.origin.partialclonefilter
     +'
     +
    @@ t/t5617-clone-submodules-remote.sh: test_expect_success 'clone with --single-bra
     +	git clone --recurse-submodules --filter blob:none \
     +		--no-also-filter-submodules \
     +		"file://$pwd/srv.bare" super_clone3 &&
    -+	test_cmp_config -C super_clone3 1 core.repositoryformatversion &&
    ++	test_cmp_config -C super_clone3 true remote.origin.promisor &&
     +	test_cmp_config -C super_clone3 blob:none remote.origin.partialclonefilter &&
    -+	test_cmp_config -C super_clone3/sub 0 core.repositoryformatversion
    ++	test_cmp_config -C super_clone3/sub false --default false remote.origin.promisor
     +'
     +
      test_done

 Documentation/config/clone.txt     |  5 ++++
 Documentation/git-clone.txt        |  7 ++++-
 Documentation/git-submodule.txt    |  6 ++++-
 builtin/clone.c                    | 36 ++++++++++++++++++++++++--
 builtin/submodule--helper.c        | 30 +++++++++++++++++++---
 git-submodule.sh                   | 17 ++++++++++++-
 t/t5617-clone-submodules-remote.sh | 41 ++++++++++++++++++++++++++++++
 t/t7814-grep-recurse-submodules.sh | 41 ++++++++++++++++++++++++++++++
 8 files changed, 175 insertions(+), 8 deletions(-)

diff --git a/Documentation/config/clone.txt b/Documentation/config/clone.txt
index 7bcfbd18a5..26f4fb137a 100644
--- a/Documentation/config/clone.txt
+++ b/Documentation/config/clone.txt
@@ -6,3 +6,8 @@ clone.defaultRemoteName::
 clone.rejectShallow::
 	Reject to clone a repository if it is a shallow one, can be overridden by
 	passing option `--reject-shallow` in command line. See linkgit:git-clone[1]
+
+clone.filterSubmodules::
+	If a partial clone filter is provided (see `--filter` in
+	linkgit:git-rev-list[1]) and `--recurse-submodules` is used, also apply
+	the filter to submodules.
diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
index 984d194934..632bd1348e 100644
--- a/Documentation/git-clone.txt
+++ b/Documentation/git-clone.txt
@@ -16,7 +16,7 @@ SYNOPSIS
 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
 	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--[no-]reject-shallow]
-	  [--filter=<filter>] [--] <repository>
+	  [--filter=<filter> [--also-filter-submodules]] [--] <repository>
 	  [<directory>]
 
 DESCRIPTION
@@ -182,6 +182,11 @@ objects from the source repository into a pack in the cloned repository.
 	at least `<size>`. For more details on filter specifications, see
 	the `--filter` option in linkgit:git-rev-list[1].
 
+--also-filter-submodules::
+	Also apply the partial clone filter to any submodules in the repository.
+	Requires `--filter` and `--recurse-submodules`. This can be turned on by
+	default by setting the `clone.filterSubmodules` config option.
+
 --mirror::
 	Set up a mirror of the source repository.  This implies `--bare`.
 	Compared to `--bare`, `--mirror` not only maps local branches of the
diff --git a/Documentation/git-submodule.txt b/Documentation/git-submodule.txt
index 7e5f995f77..4d3ab6b9f9 100644
--- a/Documentation/git-submodule.txt
+++ b/Documentation/git-submodule.txt
@@ -133,7 +133,7 @@ If you really want to remove a submodule from the repository and commit
 that use linkgit:git-rm[1] instead. See linkgit:gitsubmodules[7] for removal
 options.
 
-update [--init] [--remote] [-N|--no-fetch] [--[no-]recommend-shallow] [-f|--force] [--checkout|--rebase|--merge] [--reference <repository>] [--depth <depth>] [--recursive] [--jobs <n>] [--[no-]single-branch] [--] [<path>...]::
+update [--init] [--remote] [-N|--no-fetch] [--[no-]recommend-shallow] [-f|--force] [--checkout|--rebase|--merge] [--reference <repository>] [--depth <depth>] [--recursive] [--jobs <n>] [--[no-]single-branch] [--filter <filter spec>] [--] [<path>...]::
 +
 --
 Update the registered submodules to match what the superproject
@@ -177,6 +177,10 @@ submodule with the `--init` option.
 
 If `--recursive` is specified, this command will recurse into the
 registered submodules, and update any nested submodules within.
+
+If `--filter <filter spec>` is specified, the given partial clone filter will be
+applied to the submodule. See linkgit:git-rev-list[1] for details on filter
+specifications.
 --
 set-branch (-b|--branch) <branch> [--] <path>::
 set-branch (-d|--default) [--] <path>::
diff --git a/builtin/clone.c b/builtin/clone.c
index 727e16e0ae..fb605e4c8d 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -71,6 +71,8 @@ static int option_dissociate;
 static int max_jobs = -1;
 static struct string_list option_recurse_submodules = STRING_LIST_INIT_NODUP;
 static struct list_objects_filter_options filter_options;
+static int option_filter_submodules = -1;    /* unspecified */
+static int config_filter_submodules = -1;    /* unspecified */
 static struct string_list server_options = STRING_LIST_INIT_NODUP;
 static int option_remote_submodules;
 
@@ -150,6 +152,8 @@ static struct option builtin_clone_options[] = {
 	OPT_SET_INT('6', "ipv6", &family, N_("use IPv6 addresses only"),
 			TRANSPORT_FAMILY_IPV6),
 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
+	OPT_BOOL(0, "also-filter-submodules", &option_filter_submodules,
+		    N_("apply partial clone filters to submodules")),
 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
 		    N_("any cloned submodules will use their remote-tracking branch")),
 	OPT_BOOL(0, "sparse", &option_sparse_checkout,
@@ -650,7 +654,7 @@ static int git_sparse_checkout_init(const char *repo)
 	return result;
 }
 
-static int checkout(int submodule_progress)
+static int checkout(int submodule_progress, int filter_submodules)
 {
 	struct object_id oid;
 	char *head;
@@ -729,6 +733,10 @@ static int checkout(int submodule_progress)
 			strvec_push(&args, "--no-fetch");
 		}
 
+		if (filter_submodules && filter_options.choice)
+			strvec_pushf(&args, "--filter=%s",
+				     expand_list_objects_filter_spec(&filter_options));
+
 		if (option_single_branch >= 0)
 			strvec_push(&args, option_single_branch ?
 					       "--single-branch" :
@@ -749,6 +757,8 @@ static int git_clone_config(const char *k, const char *v, void *cb)
 	}
 	if (!strcmp(k, "clone.rejectshallow"))
 		config_reject_shallow = git_config_bool(k, v);
+	if (!strcmp(k, "clone.filtersubmodules"))
+		config_filter_submodules = git_config_bool(k, v);
 
 	return git_default_config(k, v, cb);
 }
@@ -871,6 +881,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	struct remote *remote;
 	int err = 0, complete_refs_before_fetch = 1;
 	int submodule_progress;
+	int filter_submodules = 0;
 
 	struct transport_ls_refs_options transport_ls_refs_options =
 		TRANSPORT_LS_REFS_OPTIONS_INIT;
@@ -1066,6 +1077,27 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	if (option_reject_shallow != -1)
 		reject_shallow = option_reject_shallow;
 
+	/*
+	 * If option_filter_submodules is specified from CLI option,
+	 * ignore config_filter_submodules from git_clone_config.
+	 */
+	if (config_filter_submodules != -1)
+		filter_submodules = config_filter_submodules;
+	if (option_filter_submodules != -1)
+		filter_submodules = option_filter_submodules;
+
+	/*
+	 * Exit if the user seems to be doing something silly with submodule
+	 * filter flags (but not with filter configs, as those should be
+	 * set-and-forget).
+	 */
+	if (option_filter_submodules > 0 && !filter_options.choice)
+		die(_("the option '%s' requires '%s'"),
+		    "--also-filter-submodules", "--filter");
+	if (option_filter_submodules > 0 && !option_recurse_submodules.nr)
+		die(_("the option '%s' requires '%s'"),
+		    "--also-filter-submodules", "--recurse-submodules");
+
 	/*
 	 * apply the remote name provided by --origin only after this second
 	 * call to git_config, to ensure it overrides all config-based values.
@@ -1299,7 +1331,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	}
 
 	junk_mode = JUNK_LEAVE_REPO;
-	err = checkout(submodule_progress);
+	err = checkout(submodule_progress, filter_submodules);
 
 	free(remote_name);
 	strbuf_release(&reflog_msg);
diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
index c5d3fc3817..11552970f2 100644
--- a/builtin/submodule--helper.c
+++ b/builtin/submodule--helper.c
@@ -20,6 +20,7 @@
 #include "diff.h"
 #include "object-store.h"
 #include "advice.h"
+#include "list-objects-filter-options.h"
 
 #define OPT_QUIET (1 << 0)
 #define OPT_CACHED (1 << 1)
@@ -1630,6 +1631,7 @@ struct module_clone_data {
 	const char *name;
 	const char *url;
 	const char *depth;
+	struct list_objects_filter_options *filter_options;
 	struct string_list reference;
 	unsigned int quiet: 1;
 	unsigned int progress: 1;
@@ -1796,6 +1798,10 @@ static int clone_submodule(struct module_clone_data *clone_data)
 			strvec_push(&cp.args, "--dissociate");
 		if (sm_gitdir && *sm_gitdir)
 			strvec_pushl(&cp.args, "--separate-git-dir", sm_gitdir, NULL);
+		if (clone_data->filter_options && clone_data->filter_options->choice)
+			strvec_pushf(&cp.args, "--filter=%s",
+				     expand_list_objects_filter_spec(
+					     clone_data->filter_options));
 		if (clone_data->single_branch >= 0)
 			strvec_push(&cp.args, clone_data->single_branch ?
 				    "--single-branch" :
@@ -1852,6 +1858,7 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 {
 	int dissociate = 0, quiet = 0, progress = 0, require_init = 0;
 	struct module_clone_data clone_data = MODULE_CLONE_DATA_INIT;
+	struct list_objects_filter_options filter_options;
 
 	struct option module_clone_options[] = {
 		OPT_STRING(0, "prefix", &clone_data.prefix,
@@ -1881,17 +1888,19 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 			   N_("disallow cloning into non-empty directory")),
 		OPT_BOOL(0, "single-branch", &clone_data.single_branch,
 			 N_("clone only one branch, HEAD or --branch")),
+		OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 		OPT_END()
 	};
 
 	const char *const git_submodule_helper_usage[] = {
 		N_("git submodule--helper clone [--prefix=<path>] [--quiet] "
 		   "[--reference <repository>] [--name <name>] [--depth <depth>] "
-		   "[--single-branch] "
+		   "[--single-branch] [--filter <filter-spec>]"
 		   "--url <url> --path <path>"),
 		NULL
 	};
 
+	memset(&filter_options, 0, sizeof(filter_options));
 	argc = parse_options(argc, argv, prefix, module_clone_options,
 			     git_submodule_helper_usage, 0);
 
@@ -1899,12 +1908,14 @@ static int module_clone(int argc, const char **argv, const char *prefix)
 	clone_data.quiet = !!quiet;
 	clone_data.progress = !!progress;
 	clone_data.require_init = !!require_init;
+	clone_data.filter_options = &filter_options;
 
 	if (argc || !clone_data.url || !clone_data.path || !*(clone_data.path))
 		usage_with_options(git_submodule_helper_usage,
 				   module_clone_options);
 
 	clone_submodule(&clone_data);
+	list_objects_filter_release(&filter_options);
 	return 0;
 }
 
@@ -1994,6 +2005,7 @@ struct submodule_update_clone {
 	const char *recursive_prefix;
 	const char *prefix;
 	int single_branch;
+	struct list_objects_filter_options *filter_options;
 
 	/* to be consumed by git-submodule.sh */
 	struct update_clone_data *update_clone;
@@ -2154,6 +2166,9 @@ static int prepare_to_clone_next_submodule(const struct cache_entry *ce,
 		strvec_pushl(&child->args, "--prefix", suc->prefix, NULL);
 	if (suc->recommend_shallow && sub->recommend_shallow == 1)
 		strvec_push(&child->args, "--depth=1");
+	if (suc->filter_options && suc->filter_options->choice)
+		strvec_pushf(&child->args, "--filter=%s",
+			     expand_list_objects_filter_spec(suc->filter_options));
 	if (suc->require_init)
 		strvec_push(&child->args, "--require-init");
 	strvec_pushl(&child->args, "--path", sub->path, NULL);
@@ -2498,6 +2513,8 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 	const char *update = NULL;
 	struct pathspec pathspec;
 	struct submodule_update_clone suc = SUBMODULE_UPDATE_CLONE_INIT;
+	struct list_objects_filter_options filter_options;
+	int ret;
 
 	struct option module_update_clone_options[] = {
 		OPT_STRING(0, "prefix", &prefix,
@@ -2528,6 +2545,7 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 			   N_("disallow cloning into non-empty directory")),
 		OPT_BOOL(0, "single-branch", &suc.single_branch,
 			 N_("clone only one branch, HEAD or --branch")),
+		OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 		OPT_END()
 	};
 
@@ -2540,20 +2558,26 @@ static int update_clone(int argc, const char **argv, const char *prefix)
 	update_clone_config_from_gitmodules(&suc.max_jobs);
 	git_config(git_update_clone_config, &suc.max_jobs);
 
+	memset(&filter_options, 0, sizeof(filter_options));
 	argc = parse_options(argc, argv, prefix, module_update_clone_options,
 			     git_submodule_helper_usage, 0);
+	suc.filter_options = &filter_options;
 
 	if (update)
 		if (parse_submodule_update_strategy(update, &suc.update) < 0)
 			die(_("bad value for update parameter"));
 
-	if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0)
+	if (module_list_compute(argc, argv, prefix, &pathspec, &suc.list) < 0) {
+		list_objects_filter_release(&filter_options);
 		return 1;
+	}
 
 	if (pathspec.nr)
 		suc.warn_if_uninitialized = 1;
 
-	return update_submodules(&suc);
+	ret = update_submodules(&suc);
+	list_objects_filter_release(&filter_options);
+	return ret;
 }
 
 static int run_update_procedure(int argc, const char **argv, const char *prefix)
diff --git a/git-submodule.sh b/git-submodule.sh
index 652861aa66..87772ac891 100755
--- a/git-submodule.sh
+++ b/git-submodule.sh
@@ -10,7 +10,7 @@ USAGE="[--quiet] [--cached]
    or: $dashless [--quiet] status [--cached] [--recursive] [--] [<path>...]
    or: $dashless [--quiet] init [--] [<path>...]
    or: $dashless [--quiet] deinit [-f|--force] (--all| [--] <path>...)
-   or: $dashless [--quiet] update [--init] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
+   or: $dashless [--quiet] update [--init [--filter=<filter-spec>]] [--remote] [-N|--no-fetch] [-f|--force] [--checkout|--merge|--rebase] [--[no-]recommend-shallow] [--reference <repository>] [--recursive] [--[no-]single-branch] [--] [<path>...]
    or: $dashless [--quiet] set-branch (--default|--branch <branch>) [--] <path>
    or: $dashless [--quiet] set-url [--] <path> <newurl>
    or: $dashless [--quiet] summary [--cached|--files] [--summary-limit <n>] [commit] [--] [<path>...]
@@ -49,6 +49,7 @@ dissociate=
 single_branch=
 jobs=
 recommend_shallow=
+filter=
 
 die_if_unmatched ()
 {
@@ -347,6 +348,14 @@ cmd_update()
 		--no-single-branch)
 			single_branch="--no-single-branch"
 			;;
+		--filter)
+			case "$2" in '') usage ;; esac
+			filter="--filter=$2"
+			shift
+			;;
+		--filter=*)
+			filter="$1"
+			;;
 		--)
 			shift
 			break
@@ -361,6 +370,11 @@ cmd_update()
 		shift
 	done
 
+	if test -n "$filter" && test "$init" != "1"
+	then
+		usage
+	fi
+
 	if test -n "$init"
 	then
 		cmd_init "--" "$@" || return
@@ -379,6 +393,7 @@ cmd_update()
 		$single_branch \
 		$recommend_shallow \
 		$jobs \
+		$filter \
 		-- \
 		"$@" || echo "#unmatched" $?
 	} | {
diff --git a/t/t5617-clone-submodules-remote.sh b/t/t5617-clone-submodules-remote.sh
index e2dbb4eaba..ca8f80083a 100755
--- a/t/t5617-clone-submodules-remote.sh
+++ b/t/t5617-clone-submodules-remote.sh
@@ -28,6 +28,13 @@ test_expect_success 'setup' '
 	)
 '
 
+# bare clone giving "srv.bare" for use as our server.
+test_expect_success 'setup bare clone for server' '
+	git clone --bare "file://$(pwd)/." srv.bare &&
+	git -C srv.bare config --local uploadpack.allowfilter 1 &&
+	git -C srv.bare config --local uploadpack.allowanysha1inwant 1
+'
+
 test_expect_success 'clone with --no-remote-submodules' '
 	test_when_finished "rm -rf super_clone" &&
 	git clone --recurse-submodules --no-remote-submodules "file://$pwd/." super_clone &&
@@ -65,4 +72,38 @@ test_expect_success 'clone with --single-branch' '
 	)
 '
 
+# do basic partial clone from "srv.bare"
+# confirm partial clone was registered in the local config for super and sub.
+test_expect_success 'clone with --filter' '
+	git clone --recurse-submodules \
+		--filter blob:none --also-filter-submodules \
+		"file://$pwd/srv.bare" super_clone &&
+	test_cmp_config -C super_clone true remote.origin.promisor &&
+	test_cmp_config -C super_clone blob:none remote.origin.partialclonefilter &&
+	test_cmp_config -C super_clone/sub true remote.origin.promisor &&
+	test_cmp_config -C super_clone/sub blob:none remote.origin.partialclonefilter
+'
+
+# check that clone.filterSubmodules works (--also-filter-submodules can be
+# omitted)
+test_expect_success 'filters applied with clone.filterSubmodules' '
+	test_config_global clone.filterSubmodules true &&
+	git clone --recurse-submodules --filter blob:none \
+		"file://$pwd/srv.bare" super_clone2 &&
+	test_cmp_config -C super_clone2 true remote.origin.promisor &&
+	test_cmp_config -C super_clone2 blob:none remote.origin.partialclonefilter &&
+	test_cmp_config -C super_clone2/sub true remote.origin.promisor &&
+	test_cmp_config -C super_clone2/sub blob:none remote.origin.partialclonefilter
+'
+
+test_expect_success '--no-also-filter-submodules overrides clone.filterSubmodules=true' '
+	test_config_global clone.filterSubmodules true &&
+	git clone --recurse-submodules --filter blob:none \
+		--no-also-filter-submodules \
+		"file://$pwd/srv.bare" super_clone3 &&
+	test_cmp_config -C super_clone3 true remote.origin.promisor &&
+	test_cmp_config -C super_clone3 blob:none remote.origin.partialclonefilter &&
+	test_cmp_config -C super_clone3/sub false --default false remote.origin.promisor
+'
+
 test_done
diff --git a/t/t7814-grep-recurse-submodules.sh b/t/t7814-grep-recurse-submodules.sh
index 058e5d0c96..48ab7a05c4 100755
--- a/t/t7814-grep-recurse-submodules.sh
+++ b/t/t7814-grep-recurse-submodules.sh
@@ -544,4 +544,45 @@ test_expect_failure 'grep saves textconv cache in the appropriate repository' '
 	test_path_is_file "$sub_textconv_cache"
 '
 
+test_expect_success 'grep partially-cloned submodule' '
+	# Set up clean superproject and submodule for partial cloning.
+	git init super &&
+	git init super/sub &&
+	(
+		cd super &&
+		test_commit --no-tag "Add file in superproject" \
+			super-file "Some content for super-file" &&
+		test_commit -C sub --no-tag "Add file in submodule" \
+			sub-file "Some content for sub-file" &&
+		git submodule add ./sub &&
+		git commit -m "Add other as submodule sub" &&
+		test_tick &&
+		test_commit -C sub --no-tag --append "Update file in submodule" \
+			sub-file "Some more content for sub-file" &&
+		git add sub &&
+		git commit -m "Update submodule" &&
+		test_tick &&
+		git config --local uploadpack.allowfilter 1 &&
+		git config --local uploadpack.allowanysha1inwant 1 &&
+		git -C sub config --local uploadpack.allowfilter 1 &&
+		git -C sub config --local uploadpack.allowanysha1inwant 1
+	) &&
+	# Clone the superproject & submodule, then make sure we can lazy-fetch submodule objects.
+	git clone --filter=blob:none --also-filter-submodules \
+		--recurse-submodules "file://$(pwd)/super" partial &&
+	(
+		cd partial &&
+		cat >expect <<-\EOF &&
+		HEAD^:sub/sub-file:Some content for sub-file
+		HEAD^:super-file:Some content for super-file
+		EOF
+
+		GIT_TRACE2_EVENT="$(pwd)/trace2.log" git grep -e content \
+			--recurse-submodules HEAD^ >actual &&
+		test_cmp expect actual &&
+		# Verify that we actually fetched data from the promisor remote:
+		grep \"category\":\"promisor\",\"key\":\"fetch_count\",\"value\":\"1\" trace2.log >/dev/null
+	)
+'
+
 test_done

base-commit: b56bd95bbc8f716cb8cbb5fdc18b9b0f00323c6a
-- 
2.35.0.263.gb82422642f-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] clone, submodule: pass partial clone filters to submodules
  2022-02-05  5:00 ` [PATCH v3] " Josh Steadmon
@ 2022-02-09 22:44   ` Jonathan Tan
  2022-02-09 23:37     ` Junio C Hamano
  2022-02-19 17:30   ` [PATCH] t5617,t7814: remove unnecessary 'uploadpack.allowanysha1inwant' Philippe Blain
  1 sibling, 1 reply; 14+ messages in thread
From: Jonathan Tan @ 2022-02-09 22:44 UTC (permalink / raw)
  To: Josh Steadmon; +Cc: Jonathan Tan, git, gitster, newren

Josh Steadmon <steadmon@google.com> writes:
>  Documentation/config/clone.txt     |  5 ++++
>  Documentation/git-clone.txt        |  7 ++++-
>  Documentation/git-submodule.txt    |  6 ++++-
>  builtin/clone.c                    | 36 ++++++++++++++++++++++++--
>  builtin/submodule--helper.c        | 30 +++++++++++++++++++---
>  git-submodule.sh                   | 17 ++++++++++++-
>  t/t5617-clone-submodules-remote.sh | 41 ++++++++++++++++++++++++++++++
>  t/t7814-grep-recurse-submodules.sh | 41 ++++++++++++++++++++++++++++++
>  8 files changed, 175 insertions(+), 8 deletions(-)

Thanks for this patch. "clone" currently calls "submodule update" in
order to perform the clone in the submodule, and "submodule update" then
calls "submodule--helper", so I would expect changes in all 3 files.
Looking at the summary above, that indeed is the case.

> @@ -544,4 +544,45 @@ test_expect_failure 'grep saves textconv cache in the appropriate repository' '
>  	test_path_is_file "$sub_textconv_cache"
>  '
>  
> +test_expect_success 'grep partially-cloned submodule' '

[snip]

> +		# Verify that we actually fetched data from the promisor remote:
> +		grep \"category\":\"promisor\",\"key\":\"fetch_count\",\"value\":\"1\" trace2.log >/dev/null

No need to redirect to /dev/null, but probably not worth a reroll on its
own.

This patch looks good to me.
Reviewed-by: Jonathan Tan <jonathantanmy@google.com>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] clone, submodule: pass partial clone filters to submodules
  2022-02-09 22:44   ` Jonathan Tan
@ 2022-02-09 23:37     ` Junio C Hamano
  0 siblings, 0 replies; 14+ messages in thread
From: Junio C Hamano @ 2022-02-09 23:37 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: Josh Steadmon, git, newren

Jonathan Tan <jonathantanmy@google.com> writes:

> Josh Steadmon <steadmon@google.com> writes:
>>  Documentation/config/clone.txt     |  5 ++++
>>  Documentation/git-clone.txt        |  7 ++++-
>>  Documentation/git-submodule.txt    |  6 ++++-
>>  builtin/clone.c                    | 36 ++++++++++++++++++++++++--
>>  builtin/submodule--helper.c        | 30 +++++++++++++++++++---
>>  git-submodule.sh                   | 17 ++++++++++++-
>>  t/t5617-clone-submodules-remote.sh | 41 ++++++++++++++++++++++++++++++
>>  t/t7814-grep-recurse-submodules.sh | 41 ++++++++++++++++++++++++++++++
>>  8 files changed, 175 insertions(+), 8 deletions(-)
>
> Thanks for this patch. "clone" currently calls "submodule update" in
> order to perform the clone in the submodule, and "submodule update" then
> calls "submodule--helper", so I would expect changes in all 3 files.
> Looking at the summary above, that indeed is the case.
>
>> @@ -544,4 +544,45 @@ test_expect_failure 'grep saves textconv cache in the appropriate repository' '
>>  	test_path_is_file "$sub_textconv_cache"
>>  '
>>  
>> +test_expect_success 'grep partially-cloned submodule' '
>
> [snip]
>
>> +		# Verify that we actually fetched data from the promisor remote:
>> +		grep \"category\":\"promisor\",\"key\":\"fetch_count\",\"value\":\"1\" trace2.log >/dev/null
>
> No need to redirect to /dev/null, but probably not worth a reroll on its
> own.

I can strip it while queuing, then.

> This patch looks good to me.
> Reviewed-by: Jonathan Tan <jonathantanmy@google.com>

Agreed.  Thanks, both.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] t5617,t7814: remove unnecessary 'uploadpack.allowanysha1inwant'
  2022-02-05  5:00 ` [PATCH v3] " Josh Steadmon
  2022-02-09 22:44   ` Jonathan Tan
@ 2022-02-19 17:30   ` Philippe Blain
  1 sibling, 0 replies; 14+ messages in thread
From: Philippe Blain @ 2022-02-19 17:30 UTC (permalink / raw)
  To: git; +Cc: Josh Steadmon, Jonathan Tan

The tests added in the previous commit configure the "server"
repository with 'uploadpack.allowfilter', in order for it to act as a
promisor remote, and also with 'uploadpack.allowanysha1inwant'.

This second setting is unnecessary as it only affects protocol v0
operations; protocol v2, the default since eb049759fb (protocol:
re-enable v2 protocol by default, 2020-09-25), allows any OID in want
without any configuration needed.

Remove this config from both tests.

Signed-off-by: Philippe Blain <levraiphilippeblain@gmail.com>
---

Notes:
    This is based on the version in next: f05da2b48b (clone, submodule: pass
    partial clone filters to submodules, 2022-02-04).
    
    Both tests still pass.
    
    I wondered if it would be best to add '-c protocol.version=2' later on in the
    tests, to allow runnning these tests with GIT_TEST_PROTOCOL_VERSION, but I was
    not sure.
    
    Note that as I remarked in [1] last summer, the fact that
    'allow{Tip,Reachable,Any}Sha1InWant' have no effect under protocol v2 is still
    missing from the documentation...
    
    [1] https://lore.kernel.org/git/1a98c659-e7db-50a6-faf3-b3b4c15df679@gmail.com/

 t/t5617-clone-submodules-remote.sh | 3 +--
 t/t7814-grep-recurse-submodules.sh | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/t/t5617-clone-submodules-remote.sh b/t/t5617-clone-submodules-remote.sh
index ca8f80083a..3fc93b1f8d 100755
--- a/t/t5617-clone-submodules-remote.sh
+++ b/t/t5617-clone-submodules-remote.sh
@@ -31,8 +31,7 @@ test_expect_success 'setup' '
 # bare clone giving "srv.bare" for use as our server.
 test_expect_success 'setup bare clone for server' '
 	git clone --bare "file://$(pwd)/." srv.bare &&
-	git -C srv.bare config --local uploadpack.allowfilter 1 &&
-	git -C srv.bare config --local uploadpack.allowanysha1inwant 1
+	git -C srv.bare config --local uploadpack.allowfilter 1
 '
 
 test_expect_success 'clone with --no-remote-submodules' '
diff --git a/t/t7814-grep-recurse-submodules.sh b/t/t7814-grep-recurse-submodules.sh
index a4476dc492..1c9aec06a3 100755
--- a/t/t7814-grep-recurse-submodules.sh
+++ b/t/t7814-grep-recurse-submodules.sh
@@ -563,9 +563,7 @@ test_expect_success 'grep partially-cloned submodule' '
 		git commit -m "Update submodule" &&
 		test_tick &&
 		git config --local uploadpack.allowfilter 1 &&
-		git config --local uploadpack.allowanysha1inwant 1 &&
-		git -C sub config --local uploadpack.allowfilter 1 &&
-		git -C sub config --local uploadpack.allowanysha1inwant 1
+		git -C sub config --local uploadpack.allowfilter 1
 	) &&
 	# Clone the superproject & submodule, then make sure we can lazy-fetch submodule objects.
 	git clone --filter=blob:none --also-filter-submodules \

base-commit: f05da2b48b48a46db65fc768b3ffecaf996dd655
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-02-19 17:30 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-21  3:32 [PATCH] clone, submodule: pass partial clone filters to submodules Josh Steadmon
2022-01-22  1:49 ` Junio C Hamano
2022-01-25 21:00   ` Elijah Newren
2022-01-26  6:03     ` Junio C Hamano
2022-02-01 21:33       ` Josh Steadmon
2022-01-25 21:08 ` Elijah Newren
2022-02-01 21:34   ` Josh Steadmon
2022-02-05  0:40 ` [PATCH v2] " Josh Steadmon
2022-02-05  0:54   ` Josh Steadmon
2022-02-05  1:00     ` Josh Steadmon
2022-02-05  5:00 ` [PATCH v3] " Josh Steadmon
2022-02-09 22:44   ` Jonathan Tan
2022-02-09 23:37     ` Junio C Hamano
2022-02-19 17:30   ` [PATCH] t5617,t7814: remove unnecessary 'uploadpack.allowanysha1inwant' Philippe Blain

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).