git@vger.kernel.org list mirror (unofficial, one of many)
 help / color / Atom feed
* [PATCH 1/9] sparse-checkout: create builtin with 'list' subcommand
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-23 22:30   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 2/9] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature is mostly hidden from users, as its
only documentation is supplementary information in the docs for
'git read-tree'. In addition, users need to know how to edit the
.git/info/sparse-checkout file with the right patterns, then run
the appropriate 'git read-tree -mu HEAD' command. Keeping the
working directory in sync with the sparse-checkout file requires
care.

Begin an effort to make the sparse-checkout feature a porcelain
feature by creating a new 'git sparse-checkout' builtin. This
builtin will be the preferred mechanism for manipulating the
sparse-checkout file and syncing the working directory.

For now, create the basics of the builtin. Includes a single
subcommand, "git sparse-checkout list", that lists the patterns
currently in the sparse-checkout file. Test that these patterns
are parsed and written correctly to the output.

The documentation provided is adapted from the "git read-tree"
documentation with a few edits for clarity in the new context.
Extra sections are added to hint toward a future change to
a more restricted pattern set.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 .gitignore                            |  1 +
 Documentation/git-read-tree.txt       |  2 +-
 Documentation/git-sparse-checkout.txt | 90 +++++++++++++++++++++++++++
 Makefile                              |  1 +
 builtin.h                             |  1 +
 builtin/sparse-checkout.c             | 90 +++++++++++++++++++++++++++
 git.c                                 |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 50 +++++++++++++++
 8 files changed, 235 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh

diff --git a/.gitignore b/.gitignore
index 521d8f4fb4..9411522c4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,6 +158,7 @@
 /git-show-branch
 /git-show-index
 /git-show-ref
+/git-sparse-checkout
 /git-stage
 /git-stash
 /git-status
diff --git a/Documentation/git-read-tree.txt b/Documentation/git-read-tree.txt
index d271842608..da33f84f33 100644
--- a/Documentation/git-read-tree.txt
+++ b/Documentation/git-read-tree.txt
@@ -436,7 +436,7 @@ support.
 SEE ALSO
 --------
 linkgit:git-write-tree[1]; linkgit:git-ls-files[1];
-linkgit:gitignore[5]
+linkgit:gitignore[5]; linkgit:git-sparse-checkout[1];
 
 GIT
 ---
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
new file mode 100644
index 0000000000..ca0ca6a12f
--- /dev/null
+++ b/Documentation/git-sparse-checkout.txt
@@ -0,0 +1,90 @@
+git-sparse-checkout(1)
+=======================
+
+NAME
+----
+git-sparse-checkout - Initialize and modify the sparse-checkout
+configuration, which reduces the checkout to a set of directories
+given by a list of prefixes.
+
+
+SYNOPSIS
+--------
+[verse]
+'git sparse-checkout <subcommand> [options]'
+
+
+DESCRIPTION
+-----------
+
+Initialize and modify the sparse-checkout configuration, which reduces
+the checkout to a set of directories given by a list of prefixes.
+
+
+COMMANDS
+--------
+'list'::
+	Provide a list of the contents in the sparse-checkout file.
+
+
+SPARSE CHECKOUT
+----------------
+
+"Sparse checkout" allows populating the working directory sparsely.
+It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
+Git whether a file in the working directory is worth looking at. If
+the skip-worktree bit is set, then the file is ignored in the working
+directory. Git will not populate the contents of those files, which
+makes a sparse checkout helpful when working in a repository with many
+files, but only a few are important to the current user.
+
+The `$GIT_DIR/info/sparse-checkout` file is used to define the
+skip-worktree reference bitmap. When Git updates the working
+directory, it resets the skip-worktree bit in the index based on this
+file. If an entry
+matches a pattern in this file, skip-worktree will not be set on
+that entry. Otherwise, skip-worktree will be set.
+
+Then it compares the new skip-worktree value with the previous one. If
+skip-worktree turns from set to unset, it will add the corresponding
+file back. If it turns from unset to set, that file will be removed.
+
+## FULL PATTERN SET
+
+By default, the sparse-checkout file uses the same syntax as `.gitignore`
+files.
+
+While `$GIT_DIR/info/sparse-checkout` is usually used to specify what
+files are in, you can also specify what files are _not_ in, using
+negate patterns. For example, to remove the file `unwanted`:
+
+----------------
+/*
+!unwanted
+----------------
+
+Another tricky thing is fully repopulating the working directory when you
+no longer want sparse checkout. You cannot just disable "sparse
+checkout" because skip-worktree bits are still in the index and your working
+directory is still sparsely populated. You should re-populate the working
+directory with the `$GIT_DIR/info/sparse-checkout` file content as
+follows:
+
+----------------
+/*
+----------------
+
+Then you can disable sparse checkout. Sparse checkout support in 'git
+read-tree' and similar commands is disabled by default. You need to
+set `core.sparseCheckout` to `true` in order to have sparse checkout
+support.
+
+SEE ALSO
+--------
+
+linkgit:git-read-tree[1]
+linkgit:gitignore[5]
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index f9255344ae..d1d067ab65 100644
--- a/Makefile
+++ b/Makefile
@@ -1126,6 +1126,7 @@ BUILTIN_OBJS += builtin/shortlog.o
 BUILTIN_OBJS += builtin/show-branch.o
 BUILTIN_OBJS += builtin/show-index.o
 BUILTIN_OBJS += builtin/show-ref.o
+BUILTIN_OBJS += builtin/sparse-checkout.o
 BUILTIN_OBJS += builtin/stash.o
 BUILTIN_OBJS += builtin/stripspace.o
 BUILTIN_OBJS += builtin/submodule--helper.o
diff --git a/builtin.h b/builtin.h
index 5cf5df69f7..2b25a80cde 100644
--- a/builtin.h
+++ b/builtin.h
@@ -225,6 +225,7 @@ int cmd_shortlog(int argc, const char **argv, const char *prefix);
 int cmd_show(int argc, const char **argv, const char *prefix);
 int cmd_show_branch(int argc, const char **argv, const char *prefix);
 int cmd_show_index(int argc, const char **argv, const char *prefix);
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix);
 int cmd_status(int argc, const char **argv, const char *prefix);
 int cmd_stash(int argc, const char **argv, const char *prefix);
 int cmd_stripspace(int argc, const char **argv, const char *prefix);
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
new file mode 100644
index 0000000000..6477a6ed9c
--- /dev/null
+++ b/builtin/sparse-checkout.c
@@ -0,0 +1,90 @@
+#include "builtin.h"
+#include "config.h"
+#include "dir.h"
+#include "parse-options.h"
+#include "pathspec.h"
+#include "repository.h"
+#include "run-command.h"
+#include "strbuf.h"
+
+static char const * const builtin_sparse_checkout_usage[] = {
+	N_("git sparse-checkout [list]"),
+	NULL
+};
+
+struct opts_sparse_checkout {
+	const char *subcommand;
+	int read_stdin;
+} opts;
+
+static char *get_sparse_checkout_filename(void)
+{
+	return git_pathdup("info/sparse-checkout");
+}
+
+static void write_excludes_to_file(FILE *fp, struct exclude_list *el)
+{
+	int i;
+
+	for (i = 0; i < el->nr; i++) {
+		struct exclude *x = el->excludes[i];
+
+		if (x->flags & EXC_FLAG_NEGATIVE)
+			fprintf(fp, "!");
+
+		fprintf(fp, "%s", x->pattern);
+
+		if (x->flags & EXC_FLAG_MUSTBEDIR)
+			fprintf(fp, "/");
+
+		fprintf(fp, "\n");
+	}
+}
+
+static int sparse_checkout_list(int argc, const char **argv)
+{
+	struct exclude_list el;
+	char *sparse_filename;
+	int res;
+
+	memset(&el, 0, sizeof(el));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
+	free(sparse_filename);
+
+	if (res < 0) {
+		warning(_("failed to parse sparse-checkout file; it may not exist"));
+		return 0;
+	}
+
+	write_excludes_to_file(stdout, &el);
+	clear_exclude_list(&el);
+
+	return 0;
+}
+
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
+{
+	static struct option builtin_sparse_checkout_options[] = {
+		OPT_END(),
+	};
+
+	if (argc == 2 && !strcmp(argv[1], "-h"))
+		usage_with_options(builtin_sparse_checkout_usage,
+				   builtin_sparse_checkout_options);
+
+	git_config(git_default_config, NULL);
+	argc = parse_options(argc, argv, prefix,
+			     builtin_sparse_checkout_options,
+			     builtin_sparse_checkout_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+
+	if (argc > 0) {
+		if (!strcmp(argv[0], "list"))
+			return sparse_checkout_list(argc, argv);
+	}
+
+	usage_with_options(builtin_sparse_checkout_usage,
+			   builtin_sparse_checkout_options);
+}
diff --git a/git.c b/git.c
index c1ee7124ed..dfb680e15d 100644
--- a/git.c
+++ b/git.c
@@ -573,6 +573,7 @@ static struct cmd_struct commands[] = {
 	{ "show-branch", cmd_show_branch, RUN_SETUP },
 	{ "show-index", cmd_show_index },
 	{ "show-ref", cmd_show_ref, RUN_SETUP },
+	{ "sparse-checkout", cmd_sparse_checkout, RUN_SETUP | NEED_WORK_TREE },
 	{ "stage", cmd_add, RUN_SETUP | NEED_WORK_TREE },
 	/*
 	 * NEEDSWORK: Until the builtin stash is thoroughly robust and no
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
new file mode 100755
index 0000000000..ba6928c641
--- /dev/null
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+test_description='sparse checkout builtin tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	git init repo &&
+	(
+		cd repo &&
+		echo "initial" >a &&
+		mkdir folder1 folder2 deep &&
+		mkdir deep/deeper1 deep/deeper2 &&
+		mkdir deep/deeper1/deepest &&
+		cp a folder1 &&
+		cp a folder2 &&
+		cp a deep &&
+		cp a deep/deeper1 &&
+		cp a deep/deeper2 &&
+		cp a deep/deeper1/deepest &&
+		git add . &&
+		git commit -m "initial commit"
+	)
+'
+
+test_expect_success 'git sparse-checkout list (empty)' '
+	git -C repo sparse-checkout list >list 2>err &&
+	test_line_count = 0 list &&
+	test_i18ngrep "failed to parse sparse-checkout file; it may not exist" err
+'
+
+test_expect_success 'git sparse-checkout list (populated)' '
+	test_when_finished rm -f repo/.git/info/sparse-checkout &&
+	cat >repo/.git/info/sparse-checkout <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	git -C repo sparse-checkout list >list &&
+	cat >expect <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	test_cmp expect list
+'
+
+test_done
\ No newline at end of file
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
@ 2019-08-20 15:11 Derrick Stolee via GitGitGadget
  2019-08-20 15:11 ` [PATCH 1/9] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
                   ` (10 more replies)
  0 siblings, 11 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano

This RFC includes a potential direction to make the sparse-checkout more
user-friendly. While there, I also present a way to use a limited set of
patterns to gain a significant performance boost in very large repositories.

Sparse-checkout is only documented as a subsection of the read-tree docs
[1], which makes the feature hard to discover. Users have trouble navigating
the feature, especially at clone time [2], and have even resorted to
creating their own helper tools [3].

This RFC attempts to solve these problems using a new builtin. Here is a
sample workflow to give a feeling for how it can work:

In an existing repo:

$ git sparse-checkout init
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout add
/myFolder/*
^D
$ ls
myFile1.txt myFile2.txt myFolder
$ ls myFolder
a.c a.h
$ git sparse-checkout disable
$ ls
hiddenFolder myFile1.txt myFile2.txt myFolder

At clone time:

$ git clone --sparse origin repo
$ cd repo
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout add
/myFolder/*
^D
$ ls
myFile1.txt myFile2.txt myFolder

Here are some more specific details:

 * git sparse-checkout init enables core.sparseCheckout and populates the
   sparse-checkout file with patterns that match only the files at root.
   
   
 * git clone learns the --sparse argument to run git sparse-checkout init 
   before the first checkout.
   
   
 * git sparse-checkout add reads patterns from stdin, one per line, then
   adds them to the sparse-checkout file and refreshes the working
   directory.
   
   
 * git sparse-checkout disable removes the patterns from the sparse-checkout
   file, disables core.sparseCheckout, and refills the working directory.
   
   
 * git sparse-checkout list lists the contents of the sparse-checkout file.
   
   

The documentation for the sparse-checkout feature can now live primarily
with the git-sparse-checkout documentation.

Cone Mode
=========

What really got me interested in this area is a performance problem. If we
have N patterns in the sparse-checkout file and M entries in the index, then
we can perform up to O(N * M) pattern checks in clear_ce_flags(). This
quadratic growth is not sustainable in a repo with 1,000+ patterns and
1,000,000+ index entries.

To solve this problem, I propose a new, more restrictive mode to
sparse-checkout: "cone mode". In this mode, all patterns are based on prefix
matches at a directory level. This can then use hashsets for fast
performance -- O(M) instead of O(N*M). My hashset implementation is based on
the virtual filesystem hook in the VFS for Git custom code [4].

In cone mode, a user specifies a list of folders which the user wants every
file inside. In addition, the cone adds all blobs that are siblings of the
folders in the directory path to that folder. This makes the directories
look "hydrated" as a user drills down to those recursively-closed folders.
These directories are called "parent" folders, as a file matches them only
if the file's immediate parent is that directory.

When building a prototype of this feature, I used a separate file to contain
the list of recursively-closed folders and built the hashsets dynamically
based on that file. In this implementation, I tried to maximize the amount
of backwards-compatibility by storing all data in the sparse-checkout file
using patterns recognized by earlier Git versions.

For example, if we add A/B/C as a recursive folder, then we add the
following patterns to the sparse-checkout file:

/*
!/*/*
/A/*
!/A/*/*
/A/B/*
!/A/B/*/*
/A/B/C/*

The alternating positive/negative patterns say "include everything in this
folder, but exclude everything another level deeper". The final pattern has
no matching negation, so is a recursively closed pattern.

Note that I have some basic warnings to try and detect when the
sparse-checkout file doesn't match what would be written by a cone-mode add.
In such a case, Git writes a warning to stderr and continues with the old
pattern matching algorithm. These checks are currently very barebones, and
would need to be updated with more robust checks for things like regex
characters in the middle of the pattern. As review moves forward (and if we
don't change the data storage) then we could spend more time on this.

Thanks, -Stolee

[1] https://git-scm.com/docs/git-read-tree#_sparse_checkout Sparse-checkout
documentation in git-read-tree.

[2] https://stackoverflow.com/a/4909267/127088 Is it possible to do a sparse
checkout without checking out the whole repository first?

[3] http://www.marcoyuen.com/articles/2016/06/07/git-sparse.html A blog post
of a user's extra "git-sparse" helper.

[4] 
https://github.com/git/git/compare/fc5fd706ff733392053e6180086a4d7f96acc2af...01204f24c5349aa2fb0c474546d768946d315dab
The virtual filesystem hook in microsoft/git.

Derrick Stolee (8):
  sparse-checkout: create builtin with 'list' subcommand
  sparse-checkout: create 'init' subcommand
  clone: add --sparse mode
  sparse-checkout: 'add' subcommand
  sparse-checkout: create 'disable' subcommand
  sparse-checkout: add 'cone' mode
  sparse-checkout: use hashmaps for cone patterns
  sparse-checkout: init and add in cone mode

Jeff Hostetler (1):
  trace2:experiment: clear_ce_flags_1

 .gitignore                            |   1 +
 Documentation/config/core.txt         |   7 +-
 Documentation/git-clone.txt           |   8 +-
 Documentation/git-read-tree.txt       |   2 +-
 Documentation/git-sparse-checkout.txt | 146 ++++++++++
 Makefile                              |   1 +
 builtin.h                             |   1 +
 builtin/clone.c                       |  27 ++
 builtin/sparse-checkout.c             | 389 ++++++++++++++++++++++++++
 cache.h                               |   8 +-
 config.c                              |  10 +-
 dir.c                                 | 154 +++++++++-
 dir.h                                 |  27 ++
 environment.c                         |   2 +-
 git.c                                 |   1 +
 t/t1091-sparse-checkout-builtin.sh    | 195 +++++++++++++
 unpack-trees.c                        |  12 +-
 17 files changed, 976 insertions(+), 15 deletions(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh


base-commit: 5fa0f5238b0cd46cfe7f6fa76c3f526ea98148d9
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-316%2Fderrickstolee%2Fsparse-checkout%2Fupstream-v1
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-316/derrickstolee/sparse-checkout/upstream-v1
Pull-Request: https://github.com/gitgitgadget/git/pull/316
-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 2/9] sparse-checkout: create 'init' subcommand
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
  2019-08-20 15:11 ` [PATCH 1/9] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-23 23:02   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 3/9] clone: add --sparse mode Derrick Stolee via GitGitGadget
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

Getting started with a sparse-checkout file can be daunting. Help
users start their sparse enlistment using 'git sparse-checkout init'.
This will set 'core.sparseCheckout=true' in their config, write
an initial set of patterns to the sparse-checkout file, and update
their working directory.

Using 'git read-tree' to clear directories does not work cleanly
on Windows, so manually delete directories that are tracked by Git
before running read-tree.

Running another process for 'git read-tree' is likely
suboptimal, but that can be improved in a later change, if valuable.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt |   7 ++
 builtin/sparse-checkout.c             | 106 +++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    |  40 ++++++++++
 3 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index ca0ca6a12f..50c53ee60a 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -26,6 +26,13 @@ COMMANDS
 'list'::
 	Provide a list of the contents in the sparse-checkout file.
 
+'init'::
+	Enable the `core.sparseCheckout` setting. If the
+	sparse-checkout file does not exist, then populate it with
+	patterns that match every file in the root directory and
+	no other directories, then will remove all directories tracked
+	by Git. Add patterns to the sparse-checkout file to
+	repopulate the working directory.
 
 SPARSE CHECKOUT
 ----------------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 6477a6ed9c..86d24e6295 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [list]"),
+	N_("git sparse-checkout [init|list]"),
 	NULL
 };
 
@@ -64,6 +64,108 @@ static int sparse_checkout_list(int argc, const char **argv)
 	return 0;
 }
 
+static int sc_read_tree(void)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to update index with new sparse-checkout paths"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
+static int sc_enable_config(void)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", "true", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to enable core.sparseCheckout"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
+static int delete_directory(const struct object_id *oid, struct strbuf *base,
+		const char *pathname, unsigned mode, int stage, void *context)
+{
+	struct strbuf dirname = STRBUF_INIT;
+	struct stat sb;
+
+	strbuf_addstr(&dirname, the_repository->worktree);
+	strbuf_addch(&dirname, '/');
+	strbuf_addstr(&dirname, pathname);
+
+	if (stat(dirname.buf, &sb) || !(sb.st_mode & S_IFDIR))
+		return 0;
+
+	if (remove_dir_recursively(&dirname, 0))
+		warning(_("failed to remove directory '%s'"),
+			dirname.buf);
+
+	strbuf_release(&dirname);
+	return 0;
+}
+
+static int sparse_checkout_init(int argc, const char **argv)
+{
+	struct tree *t;
+	struct object_id oid;
+	struct exclude_list el;
+	static struct pathspec pathspec;
+	char *sparse_filename;
+	FILE *fp;
+	int res;
+
+	if (sc_enable_config())
+		return 1;
+
+	memset(&el, 0, sizeof(el));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
+
+	/* If we already have a sparse-checkout file, use it. */
+	if (res >= 0) {
+		free(sparse_filename);
+		goto reset_dir;
+	}
+
+	/* initial mode: all blobs at root */
+	fp = fopen(sparse_filename, "w");
+	free(sparse_filename);
+	fprintf(fp, "/*\n!/*/*\n");
+	fclose(fp);
+
+	/* remove all directories in the root, if tracked by Git */
+	if (get_oid("HEAD", &oid)) {
+		/* assume we are in a fresh repo */
+		return 0;
+	}
+
+	t = parse_tree_indirect(&oid);
+
+	parse_pathspec(&pathspec, PATHSPEC_ALL_MAGIC &
+				  ~(PATHSPEC_FROMTOP | PATHSPEC_LITERAL),
+		       PATHSPEC_PREFER_CWD,
+		       "", NULL);
+
+	if (read_tree_recursive(the_repository, t, "", 0, 0, &pathspec,
+				delete_directory, NULL))
+		return 1;
+
+reset_dir:
+	return sc_read_tree();
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -83,6 +185,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 	if (argc > 0) {
 		if (!strcmp(argv[0], "list"))
 			return sparse_checkout_list(argc, argv);
+		if (!strcmp(argv[0], "init"))
+			return sparse_checkout_init(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index ba6928c641..35ab84aabd 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -47,4 +47,44 @@ test_expect_success 'git sparse-checkout list (populated)' '
 	test_cmp expect list
 '
 
+test_expect_success 'git sparse-checkout init' '
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/*
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=true" config &&
+	ls repo >dir  &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
+test_expect_success 'git sparse-checkout list after init' '
+	git -C repo sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/*
+	EOF
+	test_cmp expect actual
+'
+
+test_expect_success 'init with existing sparse-checkout' '
+	echo "/folder1/*" >> repo/.git/info/sparse-checkout &&
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/*
+		/folder1/*
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+	EOF
+	test_cmp expect dir
+'
+
 test_done
\ No newline at end of file
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 3/9] clone: add --sparse mode
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
  2019-08-20 15:11 ` [PATCH 1/9] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
  2019-08-20 15:11 ` [PATCH 2/9] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-23 23:17   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 4/9] sparse-checkout: 'add' subcommand Derrick Stolee via GitGitGadget
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

When someone wants to clone a large repository, but plans to work
using a sparse-checkout file, they either need to do a full
checkout first and then reduce the patterns they included, or
clone with --no-checkout, set up their patterns, and then run
a checkout manually. This requires knowing a lot about the repo
shape and how sparse-checkout works.

Add a new '--sparse' option to 'git clone' that initializes the
sparse-checkout file to include the following patterns:

	/*
	!/*/*

These patterns include every file in the root directory, but
no directories. This allows a repo to include files like a
README or a bootstrapping script to grow enlistments from that
point.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-clone.txt        |  8 +++++++-
 builtin/clone.c                    | 27 +++++++++++++++++++++++++++
 t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
index 34011c2940..0fe91d2f04 100644
--- a/Documentation/git-clone.txt
+++ b/Documentation/git-clone.txt
@@ -15,7 +15,7 @@ SYNOPSIS
 	  [--dissociate] [--separate-git-dir <git dir>]
 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
-	  [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
+	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
 	  [<directory>]
 
 DESCRIPTION
@@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
 	used, neither remote-tracking branches nor the related
 	configuration variables are created.
 
+--sparse::
+	Initialize the sparse-checkout file so the working
+	directory starts with only the files in the root
+	of the repository. The sparse-checkout file can be
+	modified to grow the working directory as needed.
+
 --mirror::
 	Set up a mirror of the source repository.  This implies `--bare`.
 	Compared to `--bare`, `--mirror` not only maps local branches of the
diff --git a/builtin/clone.c b/builtin/clone.c
index f665b28ccc..d6d49a73ff 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -60,6 +60,7 @@ static const char *real_git_dir;
 static char *option_upload_pack = "git-upload-pack";
 static int option_verbosity;
 static int option_progress = -1;
+static int option_sparse_checkout;
 static enum transport_family family;
 static struct string_list option_config = STRING_LIST_INIT_NODUP;
 static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
@@ -147,6 +148,8 @@ static struct option builtin_clone_options[] = {
 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
 		    N_("any cloned submodules will use their remote-tracking branch")),
+	OPT_BOOL(0, "sparse", &option_sparse_checkout,
+		    N_("initialize sparse-checkout file to include only files at root")),
 	OPT_END()
 };
 
@@ -734,6 +737,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
 	}
 }
 
+static int git_sparse_checkout_init(const char *repo)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
+
+	/*
+	 * We must apply the setting in the current process
+	 * for the later checkout to use the sparse-checkout file.
+	 */
+	core_apply_sparse_checkout = 1;
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to initialize sparse-checkout"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
 static int checkout(int submodule_progress)
 {
 	struct object_id oid;
@@ -1107,6 +1131,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	if (option_required_reference.nr || option_optional_reference.nr)
 		setup_reference();
 
+	if (option_sparse_checkout && git_sparse_checkout_init(repo))
+		return 1;
+
 	remote = remote_get(option_origin);
 
 	strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 35ab84aabd..b7d5f15830 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -87,4 +87,17 @@ test_expect_success 'init with existing sparse-checkout' '
 	test_cmp expect dir
 '
 
+test_expect_success 'clone --sparse' '
+	git clone --sparse repo clone &&
+	git -C clone sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/*
+	EOF
+	test_cmp expect actual &&
+	ls clone >dir &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
 test_done
\ No newline at end of file
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 4/9] sparse-checkout: 'add' subcommand
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (2 preceding siblings ...)
  2019-08-20 15:11 ` [PATCH 3/9] clone: add --sparse mode Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-23 23:30   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 6/9] trace2:experiment: clear_ce_flags_1 Jeff Hostetler via GitGitGadget
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The 'git sparse-checkout add' subcommand takes a list of patterns
over stdin and writes them to the sparse-checkout file. Then, it
updates the working directory using 'git read-tree -mu HEAD'.

Note: if a user adds a negative pattern that would lead to the
removal of a non-empty directory, then Git may not delete that
directory (on Windows).

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt |  4 ++++
 builtin/sparse-checkout.c             | 32 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 20 +++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 50c53ee60a..6f540a3443 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -34,6 +34,10 @@ COMMANDS
 	by Git. Add patterns to the sparse-checkout file to
 	repopulate the working directory.
 
+'add'::
+	Add a set of patterns to the sparse-checkout file, as given over
+	stdin. Updates the working directory to match the new patterns.
+
 SPARSE CHECKOUT
 ----------------
 
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 86d24e6295..ec6134fecc 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|list]"),
+	N_("git sparse-checkout [init|add|list]"),
 	NULL
 };
 
@@ -166,6 +166,34 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return sc_read_tree();
 }
 
+/* 'add': append the patterns read from stdin to the sparse-checkout file. */
+static int sparse_checkout_add(int argc, const char **argv)
+{
+	struct exclude_list el;
+	char *sparse_filename;
+	FILE *fp;
+	struct strbuf line = STRBUF_INIT;
+
+	memset(&el, 0, sizeof(el));
+	sparse_filename = get_sparse_checkout_filename();
+	add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
+
+	/* xfopen() dies on failure, so fp is never NULL below. */
+	fp = xfopen(sparse_filename, "w");
+	write_excludes_to_file(fp, &el);
+
+	while (!strbuf_getline(&line, stdin)) {
+		strbuf_trim(&line);
+		fprintf(fp, "%s\n", line.buf);
+	}
+
+	fclose(fp);
+	strbuf_release(&line);
+	free(sparse_filename);
+	clear_exclude_list(&el);
+	return sc_read_tree();
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -187,6 +215,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_list(argc, argv);
 		if (!strcmp(argv[0], "init"))
 			return sparse_checkout_init(argc, argv);
+		if (!strcmp(argv[0], "add"))
+			return sparse_checkout_add(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index b7d5f15830..499bd8d6d0 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -100,4 +100,24 @@ test_expect_success 'clone --sparse' '
 	test_cmp expect dir
 '
 
+test_expect_success 'add to existing sparse-checkout' '
+	echo "/folder2/*" | git -C repo sparse-checkout add &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/*
+		/folder1/*
+		/folder2/*
+	EOF
+	git -C repo sparse-checkout list >actual &&
+	test_cmp expect actual &&
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
\ No newline at end of file
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 6/9] trace2:experiment: clear_ce_flags_1
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (3 preceding siblings ...)
  2019-08-20 15:11 ` [PATCH 4/9] sparse-checkout: 'add' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Jeff Hostetler via GitGitGadget
  2019-08-24  0:08   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 5/9] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Jeff Hostetler via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

The clear_ce_flags_1 method is used by many types of calls to
unpack_trees(). Add trace2 regions around the method, including
some flag information, so we can get granular performance data
during experiments.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 unpack-trees.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/unpack-trees.c b/unpack-trees.c
index 62276d4fef..8c3b5e8849 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1397,15 +1397,23 @@ static int clear_ce_flags(struct index_state *istate,
 			  struct exclude_list *el)
 {
 	static struct strbuf prefix = STRBUF_INIT;
+	char label[100];
+	int rval;
 
 	strbuf_reset(&prefix);
 
-	return clear_ce_flags_1(istate,
+	xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)",
+		  (unsigned long)select_mask, (unsigned long)clear_mask);
+	trace2_region_enter("exp", label, the_repository);
+	rval = clear_ce_flags_1(istate,
 				istate->cache,
 				istate->cache_nr,
 				&prefix,
 				select_mask, clear_mask,
 				el, 0);
+	trace2_region_leave("exp", label, the_repository);
+
+	return rval;
 }
 
 /*
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 5/9] sparse-checkout: create 'disable' subcommand
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (4 preceding siblings ...)
  2019-08-20 15:11 ` [PATCH 6/9] trace2:experiment: clear_ce_flags_1 Jeff Hostetler via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-23 23:50   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 7/9] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The instructions for disabling a sparse-checkout to a full
working directory are complicated and non-intuitive. Add a
subcommand, 'git sparse-checkout disable', to perform those
steps for the user.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt | 26 +++++++---------
 builtin/sparse-checkout.c             | 45 ++++++++++++++++++++++++---
 t/t1091-sparse-checkout-builtin.sh    | 15 +++++++++
 3 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 6f540a3443..de04b768ae 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -38,6 +38,10 @@ COMMANDS
 	Add a set of patterns to the sparse-checkout file, as given over
 	stdin. Updates the working directory to match the new patterns.
 
+'disable'::
+	Remove the sparse-checkout file, set `core.sparseCheckout` to
+	`false`, and restore the working directory to include all files.
+
 SPARSE CHECKOUT
 ----------------
 
@@ -60,6 +64,13 @@ Then it compares the new skip-worktree value with the previous one. If
 skip-worktree turns from set to unset, it will add the corresponding
 file back. If it turns from unset to set, that file will be removed.
 
+To repopulate the working directory with all files, use the
+`git sparse-checkout disable` command.
+
+Sparse checkout support in 'git read-tree' and similar commands is
+disabled by default. You need to set `core.sparseCheckout` to `true`
+in order to have sparse checkout support.
+
 ## FULL PATTERN SET
 
 By default, the sparse-checkout file uses the same syntax as `.gitignore`
@@ -74,21 +85,6 @@ negate patterns. For example, to remove the file `unwanted`:
 !unwanted
 ----------------
 
-Another tricky thing is fully repopulating the working directory when you
-no longer want sparse checkout. You cannot just disable "sparse
-checkout" because skip-worktree bits are still in the index and your working
-directory is still sparsely populated. You should re-populate the working
-directory with the `$GIT_DIR/info/sparse-checkout` file content as
-follows:
-
-----------------
-/*
-----------------
-
-Then you can disable sparse checkout. Sparse checkout support in 'git
-read-tree' and similar commands is disabled by default. You need to
-set `core.sparseCheckout` to `true` in order to have sparse checkout
-support.
 
 SEE ALSO
 --------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index ec6134fecc..8f97c27ec7 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|add|list]"),
+	N_("git sparse-checkout [init|add|list|disable]"),
 	NULL
 };
 
@@ -79,11 +79,24 @@ static int sc_read_tree(void)
 	return result;
 }
 
-static int sc_enable_config(void)
+static int sc_set_config(int mode)
 {
 	struct argv_array argv = ARGV_ARRAY_INIT;
 	int result = 0;
-	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", "true", NULL);
+	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", NULL);
+
+	switch (mode) {
+	case 1:
+		argv_array_pushl(&argv, "true", NULL);
+		break;
+
+	case 0:
+		argv_array_pushl(&argv, "false", NULL);
+		break;
+
+	default:
+		die(_("invalid config mode"));
+	}
 
 	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
 		error(_("failed to enable core.sparseCheckout"));
@@ -125,7 +138,7 @@ static int sparse_checkout_init(int argc, const char **argv)
 	FILE *fp;
 	int res;
 
-	if (sc_enable_config())
+	if (sc_set_config(1))
 		return 1;
 
 	memset(&el, 0, sizeof(el));
@@ -194,6 +207,28 @@ static int sparse_checkout_add(int argc, const char **argv)
 	return sc_read_tree();
 }
 
+static int sparse_checkout_disable(int argc, const char **argv)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	if (sc_set_config(1))
+		die(_("failed to change config"));
+
+	sparse_filename = get_sparse_checkout_filename();
+	/* Restore every file via a catch-all pattern; xfopen() dies on error. */
+	fp = xfopen(sparse_filename, "w");
+	fprintf(fp, "/*\n");
+	fclose(fp);
+	if (sc_read_tree())
+		die(_("error while refreshing working directory"));
+	/* The pattern file is no longer needed once the checkout is full. */
+	unlink(sparse_filename);
+	free(sparse_filename);
+
+	return sc_set_config(0);
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -217,6 +252,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_init(argc, argv);
 		if (!strcmp(argv[0], "add"))
 			return sparse_checkout_add(argc, argv);
+		if (!strcmp(argv[0], "disable"))
+			return sparse_checkout_disable(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 499bd8d6d0..68ca63a6f6 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -120,4 +120,19 @@ test_expect_success 'add to existing sparse-checkout' '
 	test_cmp expect dir
 '
 
+test_expect_success 'sparse-checkout disable' '
+	git -C repo sparse-checkout disable &&
+	test_path_is_missing repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=false" config &&
+	ls repo >dir &&
+	cat >expect <<-EOF &&
+		a
+		deep
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
\ No newline at end of file
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 7/9] sparse-checkout: add 'cone' mode
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (5 preceding siblings ...)
  2019-08-20 15:11 ` [PATCH 5/9] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-24  0:31   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 8/9] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
                   ` (3 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature can have quadratic performance as
the number of patterns and number of entries in the index grow.
If there are 1,000 patterns and 1,000,000 entries, this time can
be very significant.

Create a new 'cone' mode for the core.sparseCheckout config
option, and adjust the parser to set an appropriate enum value.

While adjusting the type of this variable, rename it from
core_apply_sparse_checkout to core_sparse_checkout. This will
help avoid parallel changes from hitting type issues, and we
can guarantee that all uses now consider the enum values instead
of the int value.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/config/core.txt         |  7 ++--
 Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++
 builtin/clone.c                       |  2 +-
 builtin/sparse-checkout.c             | 16 +++++----
 cache.h                               |  8 ++++-
 config.c                              | 10 +++++-
 environment.c                         |  2 +-
 t/t1091-sparse-checkout-builtin.sh    | 14 ++++++++
 unpack-trees.c                        |  2 +-
 9 files changed, 98 insertions(+), 13 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 75538d27e7..9b8ab2a6d4 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -591,8 +591,11 @@ core.multiPackIndex::
 	multi-pack-index design document].
 
 core.sparseCheckout::
-	Enable "sparse checkout" feature. See section "Sparse checkout" in
-	linkgit:git-read-tree[1] for more information.
+	Enable "sparse checkout" feature. If "false", then sparse-checkout
+	is disabled. If "true", then sparse-checkout is enabled with the full
+	.gitignore pattern set. If "cone", then sparse-checkout is enabled with
+	a restricted pattern set. See linkgit:git-sparse-checkout[1] for more
+	information.
 
 core.abbrev::
 	Set the length object names are abbreviated to.  If
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index de04b768ae..463319055b 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -86,6 +86,56 @@ negate patterns. For example, to remove the file `unwanted`:
 ----------------
 
 
+## CONE PATTERN SET
+
+The full pattern set allows for arbitrary pattern matches and complicated
+inclusion/exclusion rules. These can result in O(N*M) pattern matches when
+updating the index, where N is the number of patterns and M is the number
+of paths in the index. To combat this performance issue, a more restricted
+pattern set is allowed when `core.sparseCheckout` is set to `cone`.
+
+The accepted patterns in the cone pattern set are:
+
+1. *Recursive:* All paths inside a directory are included.
+
+2. *Parent:* All files immediately inside a directory are included.
+
+In addition to the above two patterns, we also expect that all files in the
+root directory are included. If a recursive pattern is added, then all
+leading directories are added as parent patterns.
+
+By default, when running `git sparse-checkout init`, the root directory is
+added as a parent pattern. At this point, the sparse-checkout file contains
+the following patterns:
+
+----------------
+/*
+!/*/*
+----------------
+
+This says "include everything in root, but nothing two levels below root."
+If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
+`A/B` are added as parent patterns. The resulting sparse-checkout file is
+now
+
+----------------
+/*
+!/*/*
+/A/*
+!/A/*/*
+/A/B/*
+!/A/B/*/*
+/A/B/C/*
+----------------
+
+Here, order matters, so the negative patterns are overridden by the positive
+patterns that appear lower in the file.
+
+If `core.sparseCheckout=cone`, then Git will parse the sparse-checkout file
+expecting patterns of these types. Git will warn if the patterns do not match.
+If the patterns do match the expected format, then Git will use faster
+hash-based algorithms to compute inclusion in the sparse-checkout.
+
 SEE ALSO
 --------
 
diff --git a/builtin/clone.c b/builtin/clone.c
index d6d49a73ff..763898ada5 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -747,7 +747,7 @@ static int git_sparse_checkout_init(const char *repo)
 	 * We must apply the setting in the current process
 	 * for the later checkout to use the sparse-checkout file.
 	 */
-	core_apply_sparse_checkout = 1;
+	core_sparse_checkout = SPARSE_CHECKOUT_FULL;
 
 	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
 		error(_("failed to initialize sparse-checkout"));
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 8f97c27ec7..77e5235720 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -79,18 +79,22 @@ static int sc_read_tree(void)
 	return result;
 }
 
-static int sc_set_config(int mode)
+static int sc_set_config(enum sparse_checkout_mode mode)
 {
 	struct argv_array argv = ARGV_ARRAY_INIT;
 	int result = 0;
 	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", NULL);
 
 	switch (mode) {
-	case 1:
+	case SPARSE_CHECKOUT_FULL:
 		argv_array_pushl(&argv, "true", NULL);
 		break;
 
-	case 0:
+	case SPARSE_CHECKOUT_CONE:
+		argv_array_pushl(&argv, "cone", NULL);
+		break;
+
+	case SPARSE_CHECKOUT_NONE:
 		argv_array_pushl(&argv, "false", NULL);
 		break;
 
@@ -138,7 +142,7 @@ static int sparse_checkout_init(int argc, const char **argv)
 	FILE *fp;
 	int res;
 
-	if (sc_set_config(1))
+	if (sc_set_config(SPARSE_CHECKOUT_FULL))
 		return 1;
 
 	memset(&el, 0, sizeof(el));
@@ -212,7 +216,7 @@ static int sparse_checkout_disable(int argc, const char **argv)
 	char *sparse_filename;
 	FILE *fp;
 
-	if (sc_set_config(1))
+	if (sc_set_config(SPARSE_CHECKOUT_FULL))
 		die(_("failed to change config"));
 
 	sparse_filename = get_sparse_checkout_filename();
@@ -226,7 +230,7 @@ static int sparse_checkout_disable(int argc, const char **argv)
 	unlink(sparse_filename);
 	free(sparse_filename);
 
-	return sc_set_config(0);
+	return sc_set_config(SPARSE_CHECKOUT_NONE);
 }
 
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
diff --git a/cache.h b/cache.h
index b1da1ab08f..4426816ca1 100644
--- a/cache.h
+++ b/cache.h
@@ -865,12 +865,18 @@ extern char *git_replace_ref_base;
 
 extern int fsync_object_files;
 extern int core_preload_index;
-extern int core_apply_sparse_checkout;
 extern int precomposed_unicode;
 extern int protect_hfs;
 extern int protect_ntfs;
 extern const char *core_fsmonitor;
 
+enum sparse_checkout_mode {
+	SPARSE_CHECKOUT_NONE = 0, /* core.sparseCheckout=false */
+	SPARSE_CHECKOUT_FULL = 1, /* "true": full pattern set */
+	SPARSE_CHECKOUT_CONE = 2, /* "cone": restricted pattern set */
+};
+extern enum sparse_checkout_mode core_sparse_checkout; /* defined in environment.c */
+
 /*
  * Include broken refs in all ref iterations, which will
  * generally choke dangerous operations rather than letting
diff --git a/config.c b/config.c
index 3900e4947b..15b7a20dd9 100644
--- a/config.c
+++ b/config.c
@@ -1360,7 +1360,15 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 	}
 
 	if (!strcmp(var, "core.sparsecheckout")) {
-		core_apply_sparse_checkout = git_config_bool(var, value);
+		int result = git_parse_maybe_bool(value);
+
+		if (result < 0) {
+			core_sparse_checkout = SPARSE_CHECKOUT_NONE;
+
+			if (!strcasecmp(value, "cone"))
+				core_sparse_checkout = SPARSE_CHECKOUT_CONE;
+		} else
+			core_sparse_checkout = result;
 		return 0;
 	}
 
diff --git a/environment.c b/environment.c
index 89af47cb85..cc12e30bd6 100644
--- a/environment.c
+++ b/environment.c
@@ -68,7 +68,7 @@ enum push_default_type push_default = PUSH_DEFAULT_UNSPECIFIED;
 enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
 char *notes_ref_name;
 int grafts_replace_parents = 1;
-int core_apply_sparse_checkout;
+enum sparse_checkout_mode core_sparse_checkout;
 int merge_log_config = -1;
 int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
 unsigned long pack_size_limit_cfg;
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 68ca63a6f6..8cc377b839 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -120,6 +120,20 @@ test_expect_success 'add to existing sparse-checkout' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: match patterns' '
+	git -C repo config --replace-all core.sparseCheckout cone &&
+	rm -rf repo/a repo/folder1 repo/folder2 &&
+	git -C repo read-tree -mu HEAD &&
+	git -C repo reset --hard &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
diff --git a/unpack-trees.c b/unpack-trees.c
index 8c3b5e8849..289c62305f 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1468,7 +1468,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 
 	trace_performance_enter();
 	memset(&el, 0, sizeof(el));
-	if (!core_apply_sparse_checkout || !o->update)
+	if (!core_sparse_checkout || !o->update)
 		o->skip_sparse_checkout = 1;
 	if (!o->skip_sparse_checkout) {
 		char *sparse = git_pathdup("info/sparse-checkout");
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 8/9] sparse-checkout: use hashmaps for cone patterns
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (6 preceding siblings ...)
  2019-08-20 15:11 ` [PATCH 7/9] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-24  4:56   ` Elijah Newren
  2019-08-20 15:11 ` [PATCH 9/9] sparse-checkout: init and add in cone mode Derrick Stolee via GitGitGadget
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The parent and recursive patterns allowed by the "cone mode"
option in sparse-checkout are restrictive enough that we
can avoid using the regex parsing. Everything is based on
prefix matches, so we can use hashsets to store the prefixes
from the sparse-checkout file. When checking a path, we can
strip path entries from the path and check the hashset for
an exact match.

As a test, I created a cone-mode sparse-checkout file for the
Linux repository that actually includes every file. This was
constructed by taking every folder in the Linux repo and creating
the pattern pairs here:

	/$folder/*
	!/$folder/*/*

This resulted in a sparse-checkout file with 8,296 patterns.
Running 'git read-tree -mu HEAD' on this file had the following
performance:

	core.sparseCheckout=false: 0.21 s (0.00 s)
	 core.sparseCheckout=true: 3.75 s (3.50 s)
	 core.sparseCheckout=cone: 0.23 s (0.01 s)

The times in parentheses above correspond to the time spent
in the first clear_ce_flags() call, according to the trace2
performance traces.

While this example is contrived, it demonstrates how these
patterns can slow the sparse-checkout feature.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt |   1 -
 dir.c                                 | 154 +++++++++++++++++++++++++-
 dir.h                                 |  27 +++++
 t/t1091-sparse-checkout-builtin.sh    |   8 ++
 4 files changed, 183 insertions(+), 7 deletions(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 463319055b..7ade827370 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -85,7 +85,6 @@ negate patterns. For example, to remove the file `unwanted`:
 !unwanted
 ----------------
 
-
 ## CONE PATTERN SET
 
 The full pattern set allows for arbitrary pattern matches and complicated
diff --git a/dir.c b/dir.c
index d021c908e5..2c5ff89a72 100644
--- a/dir.c
+++ b/dir.c
@@ -599,6 +599,99 @@ void parse_exclude_pattern(const char **pattern,
 	*patternlen = len;
 }
 
+static int el_hashmap_cmp(const void *unused_cmp_data,
+			  const void *a, const void *b, const void *key)
+{
+	const struct exclude_entry *ee1 = a, *ee2 = b;
+	/* 0 (equal) only for same length and bytes, not for a mere prefix. */
+	return ee1->patternlen != ee2->patternlen ||
+	       memcmp(ee1->pattern, ee2->pattern, ee1->patternlen);
+}
+
+/* Mirror one cone-mode pattern into the parent/recursive hashmaps. */
+static void add_exclude_to_hashsets(struct exclude_list *el, struct exclude *x)
+{
+	struct exclude_entry *e, *removed;
+	char *truncated;
+
+	if (!el->use_cone_patterns)
+		return;
+
+	/* Negated two-level glob: demote a recursive dir to a parent dir. */
+	if (x->patternlen >= 4 &&
+	    !strcmp(x->pattern + x->patternlen - 4, "/*/*")) {
+		if (!(x->flags & EXC_FLAG_NEGATIVE)) {
+			warning(_("unrecognized pattern: '%s'"), x->pattern);
+			goto clear_hashmaps;
+		}
+
+		truncated = xstrdup(x->pattern);
+		truncated[x->patternlen - 4] = 0;
+		e = xmalloc(sizeof(struct exclude_entry));
+		e->pattern = truncated;
+		e->patternlen = x->patternlen - 4;
+		hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+
+		/* hashmap_remove() hands back the entry it removed, if any. */
+		removed = hashmap_remove(&el->recursive_hashmap, e, NULL);
+		if (!removed) {
+			/* We did not see the "parent" included */
+			warning(_("unrecognized negative pattern: '%s'"), x->pattern);
+			free(truncated);
+			free(e);
+			goto clear_hashmaps;
+		}
+		hashmap_add(&el->parent_hashmap, e);
+		free(removed->pattern);
+		free(removed);
+		return;
+	}
+
+	/* Positive one-level glob: include everything below the dir. */
+	if (x->patternlen >= 2 &&
+	    !strcmp(x->pattern + x->patternlen - 2, "/*")) {
+		if (x->flags & EXC_FLAG_NEGATIVE) {
+			warning(_("unrecognized negative pattern: '%s'"), x->pattern);
+			goto clear_hashmaps;
+		}
+
+		truncated = xstrdup(x->pattern);
+		truncated[x->patternlen - 2] = 0;
+		e = xmalloc(sizeof(struct exclude_entry));
+		e->pattern = truncated;
+		e->patternlen = x->patternlen - 2;
+		hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+		hashmap_add(&el->recursive_hashmap, e);
+
+		removed = hashmap_remove(&el->parent_hashmap, e, NULL);
+		if (removed) {
+			/* we already included this at the parent level */
+			warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
+				x->pattern);
+			free(removed->pattern);
+			free(removed);
+		}
+		return;
+	}
+
+clear_hashmaps:
+	hashmap_free(&el->parent_hashmap, 1);
+	hashmap_free(&el->recursive_hashmap, 1);
+	el->use_cone_patterns = 0;
+}
+
+/* Return 1 if 'map' holds an entry matching 'pattern', else 0. */
+static int hashmap_contains_path(struct hashmap *map, struct strbuf *pattern)
+{
+	struct exclude_entry probe;
+
+	/* Build a stack-allocated lookup key; it is never inserted. */
+	probe.pattern = pattern->buf;
+	probe.patternlen = pattern->len;
+	hashmap_entry_init(&probe, memhash(pattern->buf, pattern->len));
+	return hashmap_get(map, &probe, NULL) != NULL;
+}
+
 void add_exclude(const char *string, const char *base,
 		 int baselen, struct exclude_list *el, int srcpos)
 {
@@ -623,6 +716,8 @@ void add_exclude(const char *string, const char *base,
 	ALLOC_GROW(el->excludes, el->nr + 1, el->alloc);
 	el->excludes[el->nr++] = x;
 	x->el = el;
+
+	add_exclude_to_hashsets(el, x);
 }
 
 static int read_skip_worktree_file_from_index(const struct index_state *istate,
@@ -848,6 +943,10 @@ static int add_excludes_from_buffer(char *buf, size_t size,
 	int i, lineno = 1;
 	char *entry;
 
+	el->use_cone_patterns = core_sparse_checkout == SPARSE_CHECKOUT_CONE ? 1 : 0;
+	hashmap_init(&el->recursive_hashmap, el_hashmap_cmp, NULL, 0);
+	hashmap_init(&el->parent_hashmap, el_hashmap_cmp, NULL, 0);
+
 	el->filebuf = buf;
 
 	if (skip_utf8_bom(&buf, size))
@@ -1070,18 +1169,61 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname,
 
 /*
  * Scan the list and let the last match determine the fate.
- * Return 1 for exclude, 0 for include and -1 for undecided.
+ * Return 0 for exclude, 1 for include and -1 for undecided.
  */
 int is_excluded_from_list(const char *pathname,
 			  int pathlen, const char *basename, int *dtype,
 			  struct exclude_list *el, struct index_state *istate)
 {
 	struct exclude *exclude;
-	exclude = last_exclude_matching_from_list(pathname, pathlen, basename,
-						  dtype, el, istate);
-	if (exclude)
-		return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1;
-	return -1; /* undecided */
+	struct strbuf parent_pathname = STRBUF_INIT;
+	int result = 0;
+	const char *slash_pos;
+
+	if (!el->use_cone_patterns) {
+		exclude = last_exclude_matching_from_list(pathname, pathlen, basename,
+								dtype, el, istate);
+
+		if (exclude)
+			return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1;
+
+		return -1; /* undecided */
+	}
+
+	strbuf_addch(&parent_pathname, '/');
+	strbuf_add(&parent_pathname, pathname, pathlen);
+	slash_pos = strrchr(parent_pathname.buf, '/');
+
+	if (slash_pos == parent_pathname.buf) {
+		/* include every file in root */
+		result = 1;
+		goto done;
+	}
+
+	strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+
+	if (hashmap_contains_path(&el->parent_hashmap, &parent_pathname)) {
+		result = 1;
+		goto done;
+	}
+
+	while (parent_pathname.len) {
+		if (hashmap_contains_path(&el->recursive_hashmap,
+					  &parent_pathname)) {
+			result = -1;
+			goto done;
+		}
+
+		slash_pos = strrchr(parent_pathname.buf, '/');
+		if (slash_pos == parent_pathname.buf)
+			break;
+
+		strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+	}
+
+done:
+	strbuf_release(&parent_pathname);
+	return result;
 }
 
 static struct exclude *last_exclude_matching_from_lists(struct dir_struct *dir,
diff --git a/dir.h b/dir.h
index 680079bbe3..2d3356d1c0 100644
--- a/dir.h
+++ b/dir.h
@@ -4,6 +4,7 @@
 /* See Documentation/technical/api-directory-listing.txt */
 
 #include "cache.h"
+#include "hashmap.h"
 #include "strbuf.h"
 
 struct dir_entry {
@@ -37,6 +38,13 @@ struct exclude {
 	int srcpos;
 };
 
+/* Hashmap entry holding one directory prefix taken from a cone pattern. */
+struct exclude_entry {
+	struct hashmap_entry ent;
+	char *pattern; /* pattern text with the trailing glob stripped */
+	size_t patternlen; /* length of pattern in bytes */
+};
+
 /*
  * Each excludes file will be parsed into a fresh exclude_list which
  * is appended to the relevant exclude_list_group (either EXC_DIRS or
@@ -55,6 +63,25 @@ struct exclude_list {
 	const char *src;
 
 	struct exclude **excludes;
+
+	/*
+	 * While scanning the excludes, we attempt to match the patterns
+	 * with a more restricted set that allows us to use hashsets for
+	 * matching logic, which is faster than the linear lookup in the
+	 * excludes array above. If non-zero, that check succeeded.
+	 */
+	unsigned use_cone_patterns;
+
+	/*
+	 * Stores paths where everything starting with those paths
+	 * is included.
+	 */
+	struct hashmap recursive_hashmap;
+
+	/*
+	 * Used to check single-level parents of blobs.
+	 */
+	struct hashmap parent_hashmap;
 };
 
 /*
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 8cc377b839..60f10864a1 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -134,6 +134,14 @@ test_expect_success 'cone mode: match patterns' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: warn on bad pattern' '
+	test_when_finished mv sparse-checkout repo/.git/info &&
+	cp repo/.git/info/sparse-checkout . &&
+	echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep "unrecognized negative pattern" err
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH 9/9] sparse-checkout: init and add in cone mode
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (7 preceding siblings ...)
  2019-08-20 15:11 ` [PATCH 8/9] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
@ 2019-08-20 15:11 ` Derrick Stolee via GitGitGadget
  2019-08-24  5:07   ` Elijah Newren
  2019-08-21 21:52 ` [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Elijah Newren
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
  10 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-08-20 15:11 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

To make the cone pattern set easy to use, update the behavior of
'git sparse-checkout [init|add]'.

Add '--cone' flag to 'git sparse-checkout init' to set the config
option 'core.sparseCheckout=cone'.

When running 'git sparse-checkout add' in cone mode, a user only
needs to supply a list of recursive folder matches. Git will
automatically add the necessary parent matches for the leading
directories.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 134 +++++++++++++++++++++++++++--
 t/t1091-sparse-checkout-builtin.sh |  35 ++++++++
 2 files changed, 164 insertions(+), 5 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 77e5235720..0a4e101ddd 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -6,15 +6,22 @@
 #include "repository.h"
 #include "run-command.h"
 #include "strbuf.h"
+#include "string-list.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
 	N_("git sparse-checkout [init|add|list|disable]"),
 	NULL
 };
 
+static const char * const builtin_sparse_checkout_init_usage[] = {
+	N_("git sparse-checkout init [--cone]"),
+	NULL
+};
+
 struct opts_sparse_checkout {
 	const char *subcommand;
 	int read_stdin;
+	int cone;
 } opts;
 
 static char *get_sparse_checkout_filename(void)
@@ -41,6 +48,60 @@ static void write_excludes_to_file(FILE *fp, struct exclude_list *el)
 	}
 }
 
+static void write_cone_to_file(FILE *fp, struct exclude_list *el)
+{
+	int i;
+	struct exclude_entry *entry;
+	struct hashmap_iter iter;
+	struct string_list sl = STRING_LIST_INIT_DUP;
+
+	hashmap_iter_init(&el->parent_hashmap, &iter);
+	while ((entry = hashmap_iter_next(&iter))) {
+		char *pattern = xstrdup(entry->pattern);
+		char *converted = pattern;
+		if (pattern[0] == '/')
+			converted++;
+		if (pattern[entry->patternlen - 1] == '/')
+			pattern[entry->patternlen - 1] = 0;
+		string_list_insert(&sl, converted);
+		free(pattern);
+	}
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+
+		if (!strcmp(pattern, ""))
+			fprintf(fp, "/*\n!/*/*\n");
+		else
+			fprintf(fp, "/%s/*\n!/%s/*/*\n", pattern, pattern);
+	}
+
+	string_list_clear(&sl, 0);
+
+	hashmap_iter_init(&el->recursive_hashmap, &iter);
+	while ((entry = hashmap_iter_next(&iter))) {
+		char *pattern = xstrdup(entry->pattern);
+		char *converted = pattern;
+		if (pattern[0] == '/')
+			converted++;
+		if (pattern[entry->patternlen - 1] == '/')
+			pattern[entry->patternlen - 1] = 0;
+		string_list_insert(&sl, converted);
+		free(pattern);
+	}
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+		fprintf(fp, "/%s/*\n", pattern);
+	}
+}
+
 static int sparse_checkout_list(int argc, const char **argv)
 {
 	struct exclude_list el;
@@ -141,8 +202,21 @@ static int sparse_checkout_init(int argc, const char **argv)
 	char *sparse_filename;
 	FILE *fp;
 	int res;
+	enum sparse_checkout_mode mode;
 
-	if (sc_set_config(SPARSE_CHECKOUT_FULL))
+	static struct option builtin_sparse_checkout_init_options[] = {
+		OPT_BOOL(0, "cone", &opts.cone,
+			 N_("initialize the sparse-checkout in cone mode")),
+		OPT_END(),
+	};
+
+	argc = parse_options(argc, argv, NULL,
+			     builtin_sparse_checkout_init_options,
+			     builtin_sparse_checkout_init_usage, 0);
+
+	mode = opts.cone ? SPARSE_CHECKOUT_CONE : SPARSE_CHECKOUT_FULL;
+
+	if (sc_set_config(mode))
 		return 1;
 
 	memset(&el, 0, sizeof(el));
@@ -183,6 +257,34 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return sc_read_tree();
 }
 
+static void insert_recursive_pattern(struct exclude_list *el, struct strbuf *path)
+{
+	struct exclude_entry *e = xmalloc(sizeof(struct exclude_entry));
+	e->patternlen = path->len;
+	e->pattern = strbuf_detach(path, NULL);
+	hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+
+	hashmap_add(&el->recursive_hashmap, e);
+
+	while (e->patternlen) {
+		char *slash = strrchr(e->pattern, '/');
+		char *oldpattern = e->pattern;
+		size_t newlen;
+
+		if (!slash)
+			break;
+
+		newlen = slash - e->pattern;
+		e = xmalloc(sizeof(struct exclude_entry));
+		e->patternlen = newlen;
+		e->pattern = xstrndup(oldpattern, newlen);
+		hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+
+		if (!hashmap_get(&el->parent_hashmap, e, NULL))
+			hashmap_add(&el->parent_hashmap, e);
+	}
+}
+
 static int sparse_checkout_add(int argc, const char **argv)
 {
 	struct exclude_list el;
@@ -196,11 +298,33 @@ static int sparse_checkout_add(int argc, const char **argv)
 	add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
 
 	fp = fopen(sparse_filename, "w");
-	write_excludes_to_file(fp, &el);
 
-	while (!strbuf_getline(&line, stdin)) {
-		strbuf_trim(&line);
-		fprintf(fp, "%s\n", line.buf);
+	if (core_sparse_checkout == SPARSE_CHECKOUT_FULL) {
+		write_excludes_to_file(fp, &el);
+
+		while (!strbuf_getline(&line, stdin)) {
+			strbuf_trim(&line);
+			fprintf(fp, "%s\n", line.buf);
+		}
+	} else if (core_sparse_checkout == SPARSE_CHECKOUT_CONE) {
+		while (!strbuf_getline(&line, stdin)) {
+			strbuf_trim(&line);
+
+			strbuf_trim_trailing_dir_sep(&line);
+
+			if (!line.len)
+				continue;
+
+			if (line.buf[0] == '/')
+				strbuf_remove(&line, 0, 1);
+
+			if (!line.len)
+				continue;
+
+			insert_recursive_pattern(&el, &line);
+		}
+
+		write_cone_to_file(fp, &el);
 	}
 
 	fclose(fp);
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 60f10864a1..3412bafdff 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -157,4 +157,39 @@ test_expect_success 'sparse-checkout disable' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: init and add' '
+	git -C repo sparse-checkout init --cone &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=cone" config &&
+	ls repo >dir  &&
+	echo a >expect &&
+	test_cmp expect dir &&
+	echo deep/deeper1/deepest | git -C repo sparse-checkout add &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deep
+	EOF
+	ls repo/deep >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deeper1
+	EOF
+	ls repo/deep/deeper1 >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deepest
+	EOF
+	test_cmp expect dir &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/*
+		/deep/*
+		!/deep/*/*
+		/deep/deeper1/*
+		!/deep/deeper1/*/*
+		/deep/deeper1/deepest/*
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout
+'
 test_done
\ No newline at end of file
-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (8 preceding siblings ...)
  2019-08-20 15:11 ` [PATCH 9/9] sparse-checkout: init and add in cone mode Derrick Stolee via GitGitGadget
@ 2019-08-21 21:52 ` Elijah Newren
  2019-08-22 13:10   ` Derrick Stolee
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
  10 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-08-21 21:52 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget; +Cc: Git Mailing List, Junio C Hamano

On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> This RFC includes a potential direction to make the sparse-checkout more
> user-friendly. While there, I also present a way to use a limited set of
> patterns to gain a significant performance boost in very large repositories.
>
> Sparse-checkout is only documented as a subsection of the read-tree docs
> [1], which makes the feature hard to discover. Users have trouble navigating
> the feature, especially at clone time [2], and have even resorted to
> creating their own helper tools [3].

Ooh, intriguing.  Count me as another person who has resorted to
making my own helper tool for others to use (specific to our internal
repository, though, as it also figures out inter-module dependencies
to allow specifying only a few modules of interest while still
checking out everything needed to build those; but it'd be nice to
need less scripting to handle the git-related bits to actually
sparsify or densify).

> This RFC attempts to solve these problems using a new builtin. Here is a
> sample workflow to give a feeling for how it can work:
>
> In an existing repo:
>
> $ git sparse-checkout init
> $ ls
> myFile1.txt myFile2.txt
>
> $ git sparse-checkout add
> /myFolder/*
> ^D
> $ ls
> myFile1.txt myFile2.txt myFolder
> $ ls myFolder
> a.c a.h
> $ git sparse-checkout disable
> $ ls
> hiddenFolder myFile1.txt myFile2.txt myFolder
>
> At clone time:
>
> $ git clone --sparse origin repo
> $ cd repo
> $ ls
> myFile1.txt myFile2.txt
> $ git sparse-checkout add
> /myFolder/*
> ^D
> $ ls
> myFile1.txt myFile2.txt myFolder
>
> Here are some more specific details:
>
>  * git sparse-checkout init enables core.sparseCheckout and populates the
>    sparse-checkout file with patterns that match only the files at root.

Does it enable core.sparseCheckout in the current worktree, or for all
worktrees?  Do we require extensions.worktreeConfig to be set to true
first?  If we don't require extensions.worktreeConfig to be set to
true, and users add worktrees later, do they encounter negative
surprises (immediately or later)?

worktrees in combination with sparseCheckouts were a headache here
until I just forced people to manually first set
extensions.worktreeConfig to true before using my 'sparsify' script,
regardless of whether the user was currently using worktrees.  That
fixed the issues, but having to provide a long error message and
explanation of why I wanted users to set some special config first was
slightly annoying.

I wonder if 'git worktree' and maybe even 'git config' should
themselves have special handling for core.sparseCheckout, because it
can be a real mess otherwise.

>  * git clone learns the --sparse argument to run git sparse-checkout init
>    before the first checkout.

Nice.

>  * git sparse-checkout add reads patterns from stdin, one per line, then
>    adds them to the sparse-checkout file and refreshes the working
>    directory.

The default of reading from stdin seems a bit unusual to me, and I
worry about having to explain that to users.  I'd rather the add
command took positional parameters (anything that doesn't start with a
hyphen) and added those, e.g.
  $ git sparse-checkout add '/myFolder/*'
with the option of the user specifying --stdin.

>  * git sparse-checkout disable removes the patterns from the sparse-checkout
>    file, disables core.sparseCheckout, and refills the working directory.

Does it leave an empty sparse-checkout file around?  Also, what if
users have several paths defining their sparse pattern, and want to
temporarily get a full checkout and then come back -- do they need to
re-specify all the paths?  (Maybe this *is* the route we want to go;
I'm just trying to mention any possible negative effects we _might_
run into so we can consider them.  It's not quite as relevant in my
case since people specify a few toplevel modules and sparse-checkout
gets several entries auto-generated for them.)

Also, I'm particularly worried that a user with multiple worktrees,
both sparse, could run 'git sparse-checkout disable' in one and then
find that when they return to the other worktree they get a variety of
nasty surprises (e.g. accidental staging or committing of the deletion
of a huge number of files, random weird errors, or gradual and weird
de-sparsifying as various git commands are run).  This, of course, can
be averted by making sure core.sparseCheckout is set on a per-worktree
basis, but that seems to be something people only do after running
into problems several times unless some kind of tooling enforces it.

>  * git sparse-checkout list lists the contents of the sparse-checkout file.
>
>
>
> The documentation for the sparse-checkout feature can now live primarily
> with the git-sparse-checkout documentation.

Yaay!

> Cone Mode
> =========
>
> What really got me interested in this area is a performance problem. If we
> have N patterns in the sparse-checkout file and M entries in the index, then
> we can perform up to O(N * M) pattern checks in clear_ce_flags(). This
> quadratic growth is not sustainable in a repo with 1,000+ patterns and
> 1,000,000+ index entries.

This has worried me for a while, even if it hasn't yet caused us
issues in practice.

> To solve this problem, I propose a new, more restrictive mode to
> sparse-checkout: "cone mode". In this mode, all patterns are based on prefix
> matches at a directory level. This can then use hashsets for fast
> performance -- O(M) instead of O(N*M). My hashset implementation is based on
> the virtual filesystem hook in the VFS for Git custom code [4].

Sweet!

> In cone mode, a user specifies a list of folders which the user wants every
> file inside. In addition, the cone adds all blobs that are siblings of the
> folders in the directory path to that folder. This makes the directories
> look "hydrated" as a user drills down to those recursively-closed folders.
> These directories are called "parent" folders, as a file matches them only
> if the file's immediate parent is that directory.
>
> When building a prototype of this feature, I used a separate file to contain
> the list of recursively-closed folders and built the hashsets dynamically
> based on that file. In this implementation, I tried to maximize the amount
> of backwards-compatibility by storing all data in the sparse-checkout file
> using patterns recognized by earlier Git versions.
>
> For example, if we add A/B/C as a recursive folder, then we add the
> following patterns to the sparse-checkout file:
>
> /*
> !/*/*
> /A/*
> !/A/*/*
> /A/B/*
> !/A/B/*/*
> /A/B/C/*
>
> The alternating positive/negative patterns say "include everything in this
> folder, but exclude everything another level deeper". The final pattern has
> no matching negation, so is a recursively closed pattern.

Oh, um, would there be any option for fast but without grabbing
sibling and parent files of requested directories?  And could users
still request individual files (not with regex or pathspec, but fully
specifying the path) and still get the fast mode?

Basically, our sparse usage is exclusively specifying leading
directories or full pathnames of individual files, but we really want
the repo to feel smaller and make sure people notice at a glance.  We
have a huge 'modules/' directory, and want people to be able to get
just 15 of the 500 or so subdirectories that would appear in that
directory with a non-sparse checkout.  And similarly we want to be
able to grab just one or two files from a directory of many files.

> Note that I have some basic warnings to try and check that the
> sparse-checkout file doesn't match what would be written by a cone-mode add.
> In such a case, Git writes a warning to stderr and continues with the old
> pattern matching algorithm. These checks are currently very barebones, and
> would need to be updated with more robust checks for things like regex
> characters in the middle of the pattern. As review moves forward (and if we
> don't change the data storage) then we could spend more time on this.

Instead of trying to validate the sparse-checkout file every time,
perhaps we want to change core.sparseCheckout from a boolean to a
tri-state or something where it specifies how to parse the
sparse-checkout file?  Or maybe when a special directive (some form of
comment-looking line) appears at the top of sparse-checkout then we
use the hashsets speedup while disallowing general regexes and
pathspecs other than leading directories and full pathnames?

I'm not sure if that makes sense or not; just throwing it out there as an idea.

> Derrick Stolee (8):
>   sparse-checkout: create builtin with 'list' subcommand
>   sparse-checkout: create 'init' subcommand
>   clone: add --sparse mode
>   sparse-checkout: 'add' subcommand
>   sparse-checkout: create 'disable' subcommand
>   sparse-checkout: add 'cone' mode
>   sparse-checkout: use hashmaps for cone patterns
>   sparse-checkout: init and add in cone mode
>
> Jeff Hostetler (1):
>   trace2:experiment: clear_ce_flags_1

I'll try to get some time to look over these patches in the next few days.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-21 21:52 ` [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Elijah Newren
@ 2019-08-22 13:10   ` Derrick Stolee
  2019-08-22 14:25     ` Derrick Stolee
  2019-08-24  5:40     ` Elijah Newren
  0 siblings, 2 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-08-22 13:10 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano

On 8/21/2019 5:52 PM, Elijah Newren wrote:
> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> This RFC includes a potential direction to make the sparse-checkout more
>> user-friendly. While there, I also present a way to use a limited set of
>> patterns to gain a significant performance boost in very large repositories.
>>
>> Sparse-checkout is only documented as a subsection of the read-tree docs
>> [1], which makes the feature hard to discover. Users have trouble navigating
>> the feature, especially at clone time [2], and have even resorted to
>> creating their own helper tools [3].
> 
> Ooh, intriguing.  Count me as another person who has resorted to
> making my own helper tool for others to use (specific to our internal
> repository, though, as it also figures out inter-module dependencies
> to allow specifying only a few modules of interest while still
> checking out everything needed to build those; but it'd be nice to
> need less scripting to handle the git-related bits to actually
> sparsify or densify).
> 
>> This RFC attempts to solve these problems using a new builtin. Here is a
>> sample workflow to give a feeling for how it can work:
>>
>> In an existing repo:
>>
>> $ git sparse-checkout init
>> $ ls
>> myFile1.txt myFile2.txt
>>
>> $ git sparse-checkout add
>> /myFolder/*
>> ^D
>> $ ls
>> myFile1.txt myFile2.txt myFolder
>> $ ls myFolder
>> a.c a.h
>> $ git sparse-checkout disable
>> $ ls
>> hiddenFolder myFile1.txt myFile2.txt myFolder
>>
>> At clone time:
>>
>> $ git clone --sparse origin repo
>> $ cd repo
>> $ ls
>> myFile1.txt myFile2.txt
>> $ git sparse-checkout add
>> /myFolder/*
>> ^D
>> $ ls
>> myFile1.txt myFile2.txt myFolder
>>
>> Here are some more specific details:
>>
>>  * git sparse-checkout init enables core.sparseCheckout and populates the
>>    sparse-checkout file with patterns that match only the files at root.
> 
> Does it enable core.sparseCheckout in the current worktree, or for all
> worktrees?  Do we require extensions.worktreeConfig to be set to true
> first?  If we don't require extensions.worktreeConfig to be set to
> true, and users add worktrees later, do they encounter negative
> surprises (immediately or later)?

This is an interesting scenario that I had not considered. Thanks!

My guess is that we should set `extensions.worktreeConfig=true` to
avoid surprises. I'll need to play with this to discover the answers
to these questions:

1. Where does the worktree look for the sparse-checkout file? Does
   each worktree have its own sparse-checkout file? Should it?

2. If I have `extensions.worktreeConfig=true` and `core.sparseCheckout=true`
   in the current worktree and run `git worktree add`, does the new worktree
   have `core.sparseCheckout=true`? Can we `git clone --sparse` and then
   start building sparse worktrees seamlessly?
 
> worktrees in combination with sparseCheckouts were a headache here
> until I just forced people to manually first set
> extensions.worktreeConfig to true before using my 'sparsify' script,
> regardless of whether the user was currently using worktrees.  That
> fixed the issues, but having to provide a long error message and
> explanation of why I wanted users to set some special config first was
> slightly annoying.
> 
> I wonder if 'git worktree' and maybe even 'git config' should
> themselves have special handling for core.sparseCheckout, because it
> can be a real mess otherwise.
> 
>>  * git clone learns the --sparse argument to run git sparse-checkout init
>>    before the first checkout.
> 
> Nice.
> 
>>  * git sparse-checkout add reads patterns from stdin, one per line, then
>>    adds them to the sparse-checkout file and refreshes the working
>>    directory.
> 
> The default of reading from stdin seems a bit unusual to me, and I
> worry about having to explain that to users.  I'd rather the add
> command took positional parameters (anything that doesn't start with a
> hyphen) and added those, e.g.
>   $ git sparse-checkout add '/myFolder/*'
> with the option of the user specifying --stdin.

I had the same thought, and likely that's where we should go with the
builtin. For our needs, the input over stdin is more important for
testing, so I built it first. I will adjust the CLI here to take a set
of paths over the arguments unless --stdin is given.

>>  * git sparse-checkout disable removes the patterns from the sparse-checkout
>>    file, disables core.sparseCheckout, and refills the working directory.
> 
> Does it leave an empty sparse-checkout file around?  Also, what if
> users have several paths defining their sparse pattern, and want to
> temporarily get a full checkout and then come back -- do they need to
> re-specify all the paths?  (Maybe this *is* the route we want to go;
> I'm just trying to mention any possible negative effects we _might_
> run into so we can consider them.  It's not quite as relevant in my
> case since people specify a few toplevel modules and sparse-checkout
> gets several entries auto-generated for them.)

In this case, there is an intermediate step (that follows the existing
advice) to modify the sparse-checkout file to contain only "/*\n" then
run read-tree to fill the working directory before disabling the config
setting.

Perhaps "disable" is the wrong word to use, as it makes you think that
there should be an "enable" that can quickly toggle between the two
modes. Maybe instead it should be "git sparse-checkout reset [empty|full]"
where you could 'reset' the sparse-checkout to one of two initial
states:

1. empty: only files at root are included.
2. full: all files are included.

In each case, we would obliterate the existing sparse-checkout entries,
but hopefully that behavior is more clear from the command names.

> Also, I'm particularly worried that a user with multiple worktrees,
> both sparse, could run 'git sparse-checkout disable' in one and then
> find that when they return to the other worktree they get a variety of
> nasty surprises (e.g. accidental staging or committing of the deletion
> of a huge number of files, random weird errors, or gradual and weird
> de-sparsifying as various git commands are run).  This, of course, can
> be averted by making sure core.sparseCheckout is set on a per-worktree
> basis, but that seems to be something people only do after running
> into problems several times unless some kind of tooling enforces it.
> 
>>  * git sparse-checkout list lists the contents of the sparse-checkout file.
>>
>>
>>
>> The documentation for the sparse-checkout feature can now live primarily
>> with the git-sparse-checkout documentation.
> 
> Yaay!
> 
>> Cone Mode
>> =========
>>
>> What really got me interested in this area is a performance problem. If we
>> have N patterns in the sparse-checkout file and M entries in the index, then
>> we can perform up to O(N * M) pattern checks in clear_ce_flags(). This
>> quadratic growth is not sustainable in a repo with 1,000+ patterns and
>> 1,000,000+ index entries.
> 
> This has worried me for a while, even if it hasn't yet caused us
> issues in practice.
> 
>> To solve this problem, I propose a new, more restrictive mode to
>> sparse-checkout: "cone mode". In this mode, all patterns are based on prefix
>> matches at a directory level. This can then use hashsets for fast
>> performance -- O(M) instead of O(N*M). My hashset implementation is based on
>> the virtual filesystem hook in the VFS for Git custom code [4].
> 
> Sweet!
> 
>> In cone mode, a user specifies a list of folders which the user wants every
>> file inside. In addition, the cone adds all blobs that are siblings of the
>> folders in the directory path to that folder. This makes the directories
>> look "hydrated" as a user drills down to those recursively-closed folders.
>> These directories are called "parent" folders, as a file matches them only
>> if the file's immediate parent is that directory.
>>
>> When building a prototype of this feature, I used a separate file to contain
>> the list of recursively-closed folders and built the hashsets dynamically
>> based on that file. In this implementation, I tried to maximize the amount
>> of backwards-compatibility by storing all data in the sparse-checkout file
>> using patterns recognized by earlier Git versions.
>>
>> For example, if we add A/B/C as a recursive folder, then we add the
>> following patterns to the sparse-checkout file:
>>
>> /*
>> !/*/*
>> /A/*
>> !/A/*/*
>> /A/B/*
>> !/A/B/*/*
>> /A/B/C/*
>>
>> The alternating positive/negative patterns say "include everything in this
>> folder, but exclude everything another level deeper". The final pattern has
>> no matching negation, so is a recursively closed pattern.
> 
> Oh, um, would there be any option for fast but without grabbing
> sibling and parent files of requested directories?  And could users
> still request individual files (not with regex or pathspec, but fully
> specifying the path) and still get the fast mode?

Exact files could probably be included and still be fast. It requires an
extra hash check per entry, but that's a small price to pay I think.

With the sibling files, this is something I believe to be user-friendly:
as a user drills down into the folder they included recursively, there may
be helpful files along the way, like documentation, project files, etc.

Here is my philosophical position here: a repo can take advantage of the
sparse-checkout feature if it is properly componentized. Those component
boundaries are likely at folder boundaries. Any file that exists in a parent
folder for two components is likely important to _both_ components. If
a file is large and is not needed by both components, it should be placed
deeper in the tree, so it can be avoided.

With that philosophy in mind, I designed this to help users fall into the
"pit of success" when their repo is in a good shape AND to motivate users
with repos in non-optimal shapes to reorganize.

The thought I had about exact file names is similar: if there is a large
list of files in a folder where I only need a subset, then how do I know
if a new file is added that I need? It will not show up in the directory
without updating the sparse-checkout. A user would discover this need by
something going wrong when they are not interacting with version control:
a build.

This is particularly important with the root directory. We need things
like .gitignore, .gitattributes, README, LICENSE, etc. to be populated
by default. If there are too many files at root to reasonably work with
the repo, then the repo should be reorganized using folders.

> Basically, our sparse usage is exclusively specifying leading
> directories or full pathnames of individual files, but we really want
> the repo to feel smaller and make sure people notice at a glance.  We
> have a huge 'modules/' directory, and want people to be able to get
> just 15 of the 500 or so subdirectories that would appear in that
> directory with a non-sparse checkout.  And similarly we want to be
> able to grab just one or two files from a directory of many files.

Your modules/ example seems to work with the feature as designed, as
you want a set of folders one level deeper. Grabbing one or two files
from a directory is a direction we can go with the feature, but I will
continue to believe that should be a rare occurrence compared to including
a folder recursively.

>> Note that I have some basic warnings to try and check that the
>> sparse-checkout file doesn't match what would be written by a cone-mode add.
>> In such a case, Git writes a warning to stderr and continues with the old
>> pattern matching algorithm. These checks are currently very barebones, and
>> would need to be updated with more robust checks for things like regex
>> characters in the middle of the pattern. As review moves forward (and if we
>> don't change the data storage) then we could spend more time on this.
> 
> Instead of trying to validate the sparse-checkout file every time,
> perhaps we want to change core.sparseCheckout from a boolean to a
> tri-state or something where it specifies how to parse the
> sparse-checkout file?  Or maybe when a special directive (some form of
> comment-looking line) appears at the top of sparse-checkout then we
> use the hashsets speedup while disallowing general regexes and
> pathspecs other than leading directories and full pathnames?

In this series, I turn `core.sparseCheckout` into a tri-state, and only
try to validate the sparse-checkout when `core.sparseCheckout=cone`.
This avoids spending time on the validation when someone is content using
the existing feature.

The _intent_ of using the sparse-checkout file and no extra data structure
was to let other clients (or an older client) read the sparse-checkout data
and result in the same working directory. One thing I realized after
submitting is that the tri-state config variable will cause old clients
to error on parsing the non-boolean value. Instead, in v2 I will introduce
a new boolean config variable "core.sparseCheckoutCone" that will do the
same thing as the current series when `core.sparseCheckout=cone` and will
fix this compat scenario.

> I'll try to get some time to look over these patches in the next few days.

I look forward to your feedback! I also have some feedback to respond to
from my team [1], but I'm waiting to make sure the community likes the
overall idea before jumping into code style and method organization
details.

Thanks,
-Stolee

[1] https://github.com/microsoft/git/pull/180


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-22 13:10   ` Derrick Stolee
@ 2019-08-22 14:25     ` Derrick Stolee
  2019-08-24  5:40     ` Elijah Newren
  1 sibling, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-08-22 14:25 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano

On 8/22/2019 9:10 AM, Derrick Stolee wrote:
> On 8/21/2019 5:52 PM, Elijah Newren wrote:
>> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
>>> For example, if we add A/B/C as a recursive folder, then we add the
>>> following patterns to the sparse-checkout file:
>>>
>>> /*
>>> !/*/*
>>> /A/*
>>> !/A/*/*
>>> /A/B/*
>>> !/A/B/*/*
>>> /A/B/C/*
>>>
>>> The alternating positive/negative patterns say "include everything in this
>>> folder, but exclude everything another level deeper". The final pattern has
>>> no matching negation, so is a recursively closed pattern.
>>
>> Oh, um, would there be any option for fast but without grabbing
>> sibling and parent files of requested directories?  And could users
>> still request individual files (not with regex or pathspec, but fully
>> specifying the path) and still get the fast mode?
> 
> Exact files could probably be included and still be fast. It requires an
> extra hash check per entry, but that's a small price to pay I think.

Quick side note on this point about exact file names and the REAL reason
for the parent paths that I had forgotten until just now.

The following comment exists in unpack-trees.c, clear_ce_flags_dir():

	/*
	 * TODO: check el, if there are no patterns that may conflict
	 * with ret (iow, we know in advance the incl/excl
	 * decision for the entire directory), clear flag here without
	 * calling clear_ce_flags_1(). That function will call
	 * the expensive is_excluded_from_list() on every entry.
	 */

While I haven't implemented it yet in this RFC, this TODO can actually
happen with the current set of cone patterns:

1. If we hit a directory that is not in a parent or recursive path,
   then all paths it contains must have their skipworktree bits set.
   We can avoid computing hashes for them.

2. If we hit a directory that is in a recursive path, then all paths
   it contains must have skipworktree bits off. We can avoid computing
   hashes for them.

When we have a million index entries, these hash computations are not
insignificant!

With that in mind, I think there is a _performance_ reason to include
the parent folders in addition to the _user experience_ reason. If we
were to add the complexity of exact file matches, then we would also
want to add the parent folders leading to that file so we can still
do the logic above.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 1/9] sparse-checkout: create builtin with 'list' subcommand
  2019-08-20 15:11 ` [PATCH 1/9] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-23 22:30   ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-23 22:30 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:13 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
<snip>
> The documentation provided is adapted from the "git read-tree"
> documentation with a few edits for clarity in the new context.
> Extra sections are added to hint toward a future change to
> a moer restricted pattern set.

s/moer/more/

<snip>
> +"Sparse checkout" allows populating the working directory sparsely.
> +It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
> +Git whether a file in the working directory is worth looking at. If
> +the skip-worktree bit is set, then the file is ignored in the working
> +directory. Git will not populate the contents of those files, which
> +makes a sparse checkout helpful when working in a repository with many
> +files, but only a few are important to the current user.
> +
> +The `$GIT_DIR/info/sparse-checkout` file is used to define the
> +skip-worktree reference bitmap. When Git updates the working
> +directory, it resets the skip-worktree bit in the index based on this
> +file. If an entry
> +matches a pattern in this file, skip-worktree will not be set on
> +that entry. Otherwise, skip-worktree will be set.
> +
> +Then it compares the new skip-worktree value with the previous one. If
> +skip-worktree turns from set to unset, it will add the corresponding
> +file back. If it turns from unset to set, that file will be removed.

I had to read this twice for it to make sense.  Not sure I have a real
good suggestion here, the name "skip-worktree" instead of e.g
"wanted-in-worktree" just naturally leads us into sentences with one
negation automatically and we sometimes have to add more.

Maybe just replace the last two paragraphs with:

The `$GIT_DIR/info/sparse-checkout` file is used to define the
skip-worktree reference bitmap. When Git updates the working
directory, it updates the skip-worktree bits in the index based on this
file and removes or restores files in the working copy to match.

> +
> +## FULL PATTERN SET
> +
> +By default, the sparse-checkout file uses the same syntax as `.gitignore`
> +files.
> +
> +While `$GIT_DIR/info/sparse-checkout` is usually used to specify what
> +files are in, you can also specify what files are _not_ in, using
> +negate patterns. For example, to remove the file `unwanted`:

s/in/included/?

> +Another tricky thing is fully repopulating the working directory when you
> +no longer want sparse checkout. You cannot just disable "sparse
> +checkout" because skip-worktree bits are still in the index and your working
> +directory is still sparsely populated. You should re-populate the working
> +directory with the `$GIT_DIR/info/sparse-checkout` file content as
> +follows:
> +
> +----------------
> +/*
> +----------------

Can we just get rid of this part of the documentation, since there
will be a sparse-checkout command to disable/undo/reset?  However, it
could be useful to mention cases when disabling/undoing/resetting a
sparse-checkout won't work, if there are some.  For example, with the
previous read-tree implementation, you could not undo the sparse
checkout if the index had any unstaged entries, and you couldn't undo
it if there were any files present that corresponded to the sparse
patterns (for fear they'd be overwritten -- however, every once in a
while someone tried to desparsify, it failed e.g. due to the disk
becoming full, and then after freeing up space there were zillions of
files that exactly matched what de-sparsifying would have put there
but the command wanted the user to manually delete them first.)

> +
> +Then you can disable sparse checkout. Sparse checkout support in 'git
> +read-tree' and similar commands is disabled by default. You need to
> +set `core.sparseCheckout` to `true` in order to have sparse checkout
> +support.

...and get rid of this paragraph because I'd expect git
sparse-checkout to come with a subcommand (init/add/whatever) to set
this for the user?  Unless maybe you want to add some words about why
the command sets core.sparseCheckout...and related workspace related
stuff as we talked about elsewhere.

> +test_expect_success 'git sparse-checkout list (empty)' '
> +       git -C repo sparse-checkout list >list 2>err &&
> +       test_line_count = 0 list &&
> +       test_i18ngrep "failed to parse sparse-checkout file; it may not exist" err

Is that the error we want, rather than something like "This worktree
is not sparse (no sparse-checkout file exists and core.sparseCheckout
is false)"?

> +'
> +
> +test_expect_success 'git sparse-checkout list (populated)' '
> +       test_when_finished rm -f repo/.git/info/sparse-checkout &&
> +       cat >repo/.git/info/sparse-checkout <<-EOF &&
> +               /folder1/*
> +               /deep/
> +               **/a
> +               !*bin*
> +       EOF
> +       git -C repo sparse-checkout list >list &&
> +       cat >expect <<-EOF &&
> +               /folder1/*
> +               /deep/
> +               **/a
> +               !*bin*
> +       EOF
> +       test_cmp expect list

I have a `./sparsify --stats` that reports
   You are now in a sparse checkout with only 3499 of the 53625 files.
or
  You are not in a sparse checkout.

and I have a --info option that reports both the stats and the list of
sparse paths similar to this sparse-checkout list command.  The stats
aren't important, I guess, but seem nice for the user.  I don't know
if you want to include anything like that in this or another command,
but just thought I'd mention it.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 2/9] sparse-checkout: create 'init' subcommand
  2019-08-20 15:11 ` [PATCH 2/9] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-23 23:02   ` Elijah Newren
  2019-09-11 14:27     ` Derrick Stolee
  2019-09-11 20:28     ` Derrick Stolee
  0 siblings, 2 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-23 23:02 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:13 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> Getting started with a sparse-checkout file can be daunting. Help
> users start their sparse enlistment using 'git sparse-checkout init'.
> This will set 'core.sparseCheckout=true' in their config, write
> an initial set of patterns to the sparse-checkout file, and update
> their working directory.
>
> Using 'git read-tree' to clear directories does not work cleanly
> on Windows, so manually delete directories that are tracked by Git
> before running read-tree.

Is that a bug in read-tree that needs to be fixed?

> The use of running another process for 'git read-tree' is likely
> suboptimal, but that can be improved in a later change, if valuable.

I think it's valuable.  The bigger problem may be that "git read-tree
-mu HEAD" may turn out to be insufficient for our needs; see below....

>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-sparse-checkout.txt |   7 ++
>  builtin/sparse-checkout.c             | 106 +++++++++++++++++++++++++-
>  t/t1091-sparse-checkout-builtin.sh    |  40 ++++++++++
>  3 files changed, 152 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index ca0ca6a12f..50c53ee60a 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -26,6 +26,13 @@ COMMANDS
>  'list'::
>         Provide a list of the contents in the sparse-checkout file.
>
> +'init'::
> +       Enable the `core.sparseCheckout` setting. If the
> +       sparse-checkout file does not exist, then populate it with
> +       patterns that match every file in the root directory and
> +       no other directories, then will remove all directories tracked
> +       by Git. Add patterns to the sparse-checkout file to
> +       repopulate the working directory.
>
>  SPARSE CHECKOUT
>  ----------------
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 6477a6ed9c..86d24e6295 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -8,7 +8,7 @@
>  #include "strbuf.h"
>
>  static char const * const builtin_sparse_checkout_usage[] = {
> -       N_("git sparse-checkout [list]"),
> +       N_("git sparse-checkout [init|list]"),
>         NULL
>  };
>
> @@ -64,6 +64,108 @@ static int sparse_checkout_list(int argc, const char **argv)
>         return 0;
>  }
>
> +static int sc_read_tree(void)
> +{
> +       struct argv_array argv = ARGV_ARRAY_INIT;
> +       int result = 0;
> +       argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
> +
> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
> +               error(_("failed to update index with new sparse-checkout paths"));
> +               result = 1;
> +       }

`git read-tree -m -u HEAD` will fail if the index has any higher stage
entries in it, even if those higher stage entries correspond to files
which are included in the sparseness patterns and thus would not need
an update.  It might be nice if we can find a way to provide a better
error message, and/or implement the read-tree -m -u HEAD internally in
a way that will allow us to not fail if the conflicted files are
included in the sparse set.

> +
> +       argv_array_clear(&argv);
> +       return result;
> +}
> +
> +static int sc_enable_config(void)
> +{
> +       struct argv_array argv = ARGV_ARRAY_INIT;
> +       int result = 0;
> +       argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", "true", NULL);

Why --add?  That seems really odd to me.

This should also have "--worktree".  And this function should either
set extensions.worktreeConfig to true or die if it isn't already set;
not sure which.  There's some UI and documentation stuff to figure out
here...

> +
> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
> +               error(_("failed to enable core.sparseCheckout"));
> +               result = 1;
> +       }
> +
> +       argv_array_clear(&argv);
> +       return result;
> +}
> +
> +static int delete_directory(const struct object_id *oid, struct strbuf *base,
> +               const char *pathname, unsigned mode, int stage, void *context)
> +{
> +       struct strbuf dirname = STRBUF_INIT;
> +       struct stat sb;
> +
> +       strbuf_addstr(&dirname, the_repository->worktree);
> +       strbuf_addch(&dirname, '/');
> +       strbuf_addstr(&dirname, pathname);
> +
> +       if (stat(dirname.buf, &sb) || !(sb.st_mode & S_IFDIR))
> +               return 0;
> +
> +       if (remove_dir_recursively(&dirname, 0))

flags = 0 implies not REMOVE_DIR_EMPTY_ONLY.  I'm not familiar with
remove_dir_recursively(), but won't this delete everything...including
untracked files?  If so, that sounds like a bug.

> +               warning(_("failed to remove directory '%s'"),
> +                       dirname.buf);
> +
> +       strbuf_release(&dirname);
> +       return 0;
> +}
> +
> +static int sparse_checkout_init(int argc, const char **argv)
> +{
> +       struct tree *t;
> +       struct object_id oid;
> +       struct exclude_list el;
> +       static struct pathspec pathspec;
> +       char *sparse_filename;
> +       FILE *fp;
> +       int res;
> +
> +       if (sc_enable_config())
> +               return 1;
> +
> +       memset(&el, 0, sizeof(el));
> +
> +       sparse_filename = get_sparse_checkout_filename();
> +       res = add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);

But 'el' isn't used again?  Why are we getting the list of files from
sparse_filename then?

> +
> +       /* If we already have a sparse-checkout file, use it. */
> +       if (res >= 0) {
> +               free(sparse_filename);
> +               goto reset_dir;
> +       }
> +
> +       /* initial mode: all blobs at root */
> +       fp = fopen(sparse_filename, "w");
> +       free(sparse_filename);
> +       fprintf(fp, "/*\n!/*/*\n");
> +       fclose(fp);

Makes sense.

> +
> +       /* remove all directories in the root, if tracked by Git */
> +       if (get_oid("HEAD", &oid)) {
> +               /* assume we are in a fresh repo */
> +               return 0;
> +       }
> +
> +       t = parse_tree_indirect(&oid);
> +
> +       parse_pathspec(&pathspec, PATHSPEC_ALL_MAGIC &
> +                                 ~(PATHSPEC_FROMTOP | PATHSPEC_LITERAL),
> +                      PATHSPEC_PREFER_CWD,
> +                      "", NULL);
> +
> +       if (read_tree_recursive(the_repository, t, "", 0, 0, &pathspec,
> +                               delete_directory, NULL))
> +               return 1;

Since this is only needed on Windows, as per your commit message,
should it be #ifdef'd?  Or is this actually a bug that should be fixed
in "git read-tree -mu HEAD"?

> +
> +reset_dir:
> +       return sc_read_tree();
> +}
> +

The rest looks fine.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 3/9] clone: add --sparse mode
  2019-08-20 15:11 ` [PATCH 3/9] clone: add --sparse mode Derrick Stolee via GitGitGadget
@ 2019-08-23 23:17   ` Elijah Newren
  2019-09-18 13:51     ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-08-23 23:17 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> When someone wants to clone a large repository, but plans to work
> using a sparse-checkout file, they either need to do a full
> checkout first and then reduce the patterns they included, or
> clone with --no-checkout, set up their patterns, and then run
> a checkout manually. This requires knowing a lot about the repo
> shape and how sparse-checkout works.
>
> Add a new '--sparse' option to 'git clone' that initializes the
> sparse-checkout file to include the following patterns:
>
>         /*
>         !/*/*
>
> These patterns include every file in the root directory, but
> no directories. This allows a repo to include files like a
> README or a bootstrapping script to grow enlistments from that
> point.

Nice.

>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-clone.txt        |  8 +++++++-
>  builtin/clone.c                    | 27 +++++++++++++++++++++++++++
>  t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
>  3 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
> index 34011c2940..0fe91d2f04 100644
> --- a/Documentation/git-clone.txt
> +++ b/Documentation/git-clone.txt
> @@ -15,7 +15,7 @@ SYNOPSIS
>           [--dissociate] [--separate-git-dir <git dir>]
>           [--depth <depth>] [--[no-]single-branch] [--no-tags]
>           [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
> -         [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
> +         [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
>           [<directory>]
>
>  DESCRIPTION
> @@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
>         used, neither remote-tracking branches nor the related
>         configuration variables are created.
>
> +--sparse::
> +       Initialize the sparse-checkout file so the working
> +       directory starts with only the files in the root
> +       of the repository. The sparse-checkout file can be
> +       modified to grow the working directory as needed.
> +
>  --mirror::
>         Set up a mirror of the source repository.  This implies `--bare`.
>         Compared to `--bare`, `--mirror` not only maps local branches of the
> diff --git a/builtin/clone.c b/builtin/clone.c
> index f665b28ccc..d6d49a73ff 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -60,6 +60,7 @@ static const char *real_git_dir;
>  static char *option_upload_pack = "git-upload-pack";
>  static int option_verbosity;
>  static int option_progress = -1;
> +static int option_sparse_checkout;
>  static enum transport_family family;
>  static struct string_list option_config = STRING_LIST_INIT_NODUP;
>  static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
> @@ -147,6 +148,8 @@ static struct option builtin_clone_options[] = {
>         OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
>         OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
>                     N_("any cloned submodules will use their remote-tracking branch")),
> +       OPT_BOOL(0, "sparse", &option_sparse_checkout,
> +                   N_("initialize sparse-checkout file to include only files at root")),
>         OPT_END()
>  };
>
> @@ -734,6 +737,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
>         }
>  }
>
> +static int git_sparse_checkout_init(const char *repo)
> +{
> +       struct argv_array argv = ARGV_ARRAY_INIT;
> +       int result = 0;
> +       argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
> +
> +       /*
> +        * We must apply the setting in the current process
> +        * for the later checkout to use the sparse-checkout file.
> +        */
> +       core_apply_sparse_checkout = 1;
> +
> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
> +               error(_("failed to initialize sparse-checkout"));
> +               result = 1;
> +       }

Sigh...so much forking of additional processes.  I'd really rather
that we were reducing how much of this we are doing in the codebase
instead of adding more.  Every fork makes following stuff in a
debugger harder.

> +
> +       argv_array_clear(&argv);
> +       return result;
> +}
> +
>  static int checkout(int submodule_progress)
>  {
>         struct object_id oid;
> @@ -1107,6 +1131,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
>         if (option_required_reference.nr || option_optional_reference.nr)
>                 setup_reference();
>
> +       if (option_sparse_checkout && git_sparse_checkout_init(repo))
> +               return 1;
> +
>         remote = remote_get(option_origin);
>
>         strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 35ab84aabd..b7d5f15830 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -87,4 +87,17 @@ test_expect_success 'init with existing sparse-checkout' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'clone --sparse' '
> +       git clone --sparse repo clone &&
> +       git -C clone sparse-checkout list >actual &&
> +       cat >expect <<-EOF &&
> +               /*
> +               !/*/*
> +       EOF
> +       test_cmp expect actual &&
> +       ls clone >dir &&
> +       echo a >expect &&
> +       test_cmp expect dir

Checking that a toplevel entry is present, but not checking that an
entry from a subdir is missing as expected?

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 4/9] sparse-checkout: 'add' subcommand
  2019-08-20 15:11 ` [PATCH 4/9] sparse-checkout: 'add' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-23 23:30   ` Elijah Newren
  2019-09-18 13:55     ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-08-23 23:30 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The 'git sparse-checkout add' subcommand takes a list of patterns
> over stdin and writes them to the sparse-checkout file. Then, it
> updates the working directory using 'git read-tree -mu HEAD'.

As mentioned in response to the cover letter, I'd rather see it take
patterns as positional arguments (though requiring a '--' argument
before any patterns that start with a hyphen).  It could also take
--stdin to read from stdin.

> Note: if a user adds a negative pattern that would lead to the
> removal of a non-empty directory, then Git may not delete that
> directory (on Windows).

This sounds like you're re-iterating a bug mentioned earlier, but if
someone in the future comes and reads this comment it might sound like
you're saying git can avoid clearing a directory for optimization or
other reasons.  (And, of course, it'd be nice to figure out why this
bug exists.)

Another question this brings up, though, is that you worked around
this bug in 'init' so why would you not also do so for 'add'?  Seems
inconsistent to me.

> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-sparse-checkout.txt |  4 ++++
>  builtin/sparse-checkout.c             | 32 ++++++++++++++++++++++++++-
>  t/t1091-sparse-checkout-builtin.sh    | 20 +++++++++++++++++
>  3 files changed, 55 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index 50c53ee60a..6f540a3443 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -34,6 +34,10 @@ COMMANDS
>         by Git. Add patterns to the sparse-checkout file to
>         repopulate the working directory.
>
> +'add'::
> +       Add a set of patterns to the sparse-checkout file, as given over
> +       stdin. Updates the working directory to match the new patterns.
> +
>  SPARSE CHECKOUT
>  ----------------
>
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 86d24e6295..ec6134fecc 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -8,7 +8,7 @@
>  #include "strbuf.h"
>
>  static char const * const builtin_sparse_checkout_usage[] = {
> -       N_("git sparse-checkout [init|list]"),
> +       N_("git sparse-checkout [init|add|list]"),
>         NULL
>  };
>
> @@ -166,6 +166,34 @@ static int sparse_checkout_init(int argc, const char **argv)
>         return sc_read_tree();
>  }
>
> +static int sparse_checkout_add(int argc, const char **argv)
> +{
> +       struct exclude_list el;
> +       char *sparse_filename;
> +       FILE *fp;
> +       struct strbuf line = STRBUF_INIT;
> +
> +       memset(&el, 0, sizeof(el));
> +
> +       sparse_filename = get_sparse_checkout_filename();
> +       add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);

el is an exclude_list and we call add_excludes_..., but it's actually
an *include* list.  This is going to cause errors at some point, and
will cause lots of headaches.

> +
> +       fp = fopen(sparse_filename, "w");
> +       write_excludes_to_file(fp, &el);
> +
> +       while (!strbuf_getline(&line, stdin)) {
> +               strbuf_trim(&line);
> +               fprintf(fp, "%s\n", line.buf);
> +       }

Should we first check whether these excludes are already in the
sparse-checkout file?

> +       fclose(fp);
> +       free(sparse_filename);
> +
> +       clear_exclude_list(&el);
> +
> +       return sc_read_tree();

What if someone calls 'git sparse-checkout add' without first calling
'git sparse-checkout init'?  As far as I can tell, core.sparseCheckout
will be unset (i.e. treated as false), meaning that this operation
will do some work, but result in no changes and a report of success.
After users try to figure out why it won't work, they eventually run
'git sparse-checkout init', which will delete all the entries they
previously added with the 'add' subcommand.

What should happen instead?

> +}
> +
>  int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>  {
>         static struct option builtin_sparse_checkout_options[] = {
> @@ -187,6 +215,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>                         return sparse_checkout_list(argc, argv);
>                 if (!strcmp(argv[0], "init"))
>                         return sparse_checkout_init(argc, argv);
> +               if (!strcmp(argv[0], "add"))
> +                       return sparse_checkout_add(argc, argv);
>         }
>
>         usage_with_options(builtin_sparse_checkout_usage,
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index b7d5f15830..499bd8d6d0 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -100,4 +100,24 @@ test_expect_success 'clone --sparse' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'add to existing sparse-checkout' '
> +       echo "/folder2/*" | git -C repo sparse-checkout add &&

I've always been using '/folder2/' in sparse-checkout, without the
trailing asterisk.  That seems more friendly for cone mode too.  Are
there benefits to keeping the trailing asterisk?

> +       cat >expect <<-EOF &&
> +               /*
> +               !/*/*
> +               /folder1/*
> +               /folder2/*
> +       EOF
> +       git -C repo sparse-checkout list >actual &&
> +       test_cmp expect actual &&
> +       test_cmp expect repo/.git/info/sparse-checkout &&
> +       ls repo >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               folder1
> +               folder2
> +       EOF
> +       test_cmp expect dir
> +'
> +
>  test_done
> \ No newline at end of file
> --
> gitgitgadget
>

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 5/9] sparse-checkout: create 'disable' subcommand
  2019-08-20 15:11 ` [PATCH 5/9] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
@ 2019-08-23 23:50   ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-23 23:50 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:14 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The instructions for disabling a sparse-checkout to a full
> working directory are complicated and non-intuitive. Add a
> subcommand, 'git sparse-checkout disable', to perform those
> steps for the user.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-sparse-checkout.txt | 26 +++++++---------
>  builtin/sparse-checkout.c             | 45 ++++++++++++++++++++++++---
>  t/t1091-sparse-checkout-builtin.sh    | 15 +++++++++
>  3 files changed, 67 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index 6f540a3443..de04b768ae 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -38,6 +38,10 @@ COMMANDS
>         Add a set of patterns to the sparse-checkout file, as given over
>         stdin. Updates the working directory to match the new patterns.
>
> +'disable'::
> +       Remove the sparse-checkout file, set `core.sparseCheckout` to
> +       `false`, and restore the working directory to include all files.
> +
>  SPARSE CHECKOUT
>  ----------------
>
> @@ -60,6 +64,13 @@ Then it compares the new skip-worktree value with the previous one. If
>  skip-worktree turns from set to unset, it will add the corresponding
>  file back. If it turns from unset to set, that file will be removed.
>
> +To repopulate the working directory with all files, use the
> +`git sparse-checkout disable` command.

Makes sense.

> +
> +Sparse checkout support in 'git read-tree' and similar commands is
> +disabled by default. You need to set `core.sparseCheckout` to `true`
> +in order to have sparse checkout support.

But why add this paragraph?  read-tree is plumbing that we'd rather
not force on users, so I'd rather not mention it.  And I thought we
were setting core.sparseCheckout for the user.

> -Another tricky thing is fully repopulating the working directory when you
> -no longer want sparse checkout. You cannot just disable "sparse
> -checkout" because skip-worktree bits are still in the index and your working
> -directory is still sparsely populated. You should re-populate the working
> -directory with the `$GIT_DIR/info/sparse-checkout` file content as
> -follows:
> -
> -----------------
> -/*
> -----------------
> -
> -Then you can disable sparse checkout. Sparse checkout support in 'git
> -read-tree' and similar commands is disabled by default. You need to
> -set `core.sparseCheckout` to `true` in order to have sparse checkout
> -support.

Wahoo!


> -static int sc_enable_config(void)
> +static int sc_set_config(int mode)
>  {
>         struct argv_array argv = ARGV_ARRAY_INIT;
>         int result = 0;
> -       argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", "true", NULL);
> +       argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", NULL);

Remove "--add" and add "--worktree".

> +
> +       switch (mode) {
> +       case 1:
> +               argv_array_pushl(&argv, "true", NULL);
> +               break;
> +
> +       case 0:
> +               argv_array_pushl(&argv, "false", NULL);
> +               break;
> +
> +       default:
> +               die(_("invalid config mode"));
> +       }
>
>         if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {

Can't we use git_config_set instead of run_command_v_opt?

>                 error(_("failed to enable core.sparseCheckout"));
> @@ -125,7 +138,7 @@ static int sparse_checkout_init(int argc, const char **argv)
>         FILE *fp;
>         int res;
>
> -       if (sc_enable_config())
> +       if (sc_set_config(1))

Random idea: The code would be more self-documenting if you just made
sc_set_config take a string and you passed either "true" or "false".
Then you could also dispense with the switch statement.  Yes, you can
make typos, but there are only going to be 2-3 places in the code that
call it, right?

>                 return 1;
>
>         memset(&el, 0, sizeof(el));
> @@ -194,6 +207,28 @@ static int sparse_checkout_add(int argc, const char **argv)
>         return sc_read_tree();
>  }
>
> +static int sparse_checkout_disable(int argc, const char **argv)
> +{
> +       char *sparse_filename;
> +       FILE *fp;
> +
> +       if (sc_set_config(1))
> +               die(_("failed to change config"));

Ooh, preventative setting.  This'll probably save some grief for
people who get themselves in a weird state and start mucking around
manually; very nice.

> +
> +       sparse_filename = get_sparse_checkout_filename();
> +       fp = fopen(sparse_filename, "w");
> +       fprintf(fp, "/*\n");
> +       fclose(fp);
> +
> +       if (sc_read_tree())
> +               die(_("error while refreshing working directory"));

This can fail e.g. if there are conflicts in the index.  Luckily, if
they fix them up and then re-run "git sparse-checkout disable" then it
can work.  But you've already changed the
$GIT_DIR/info/sparse-checkout file to a state that won't match what
they actually have checked out.  Will that cause any problems?  (I
can't think of any immediately, but I am curious.)

Also, should or can we give a better error message, such as
instructing them to try again after they've fixed any problems
reported by the sc_read_tree() operation?

> +
> +       unlink(sparse_filename);
> +       free(sparse_filename);
> +
> +       return sc_set_config(0);
> +}
> +
>  int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>  {
>         static struct option builtin_sparse_checkout_options[] = {
> @@ -217,6 +252,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>                         return sparse_checkout_init(argc, argv);
>                 if (!strcmp(argv[0], "add"))
>                         return sparse_checkout_add(argc, argv);
> +               if (!strcmp(argv[0], "disable"))
> +                       return sparse_checkout_disable(argc, argv);
>         }
>
>         usage_with_options(builtin_sparse_checkout_usage,
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 499bd8d6d0..68ca63a6f6 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -120,4 +120,19 @@ test_expect_success 'add to existing sparse-checkout' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'sparse-checkout disable' '
> +       git -C repo sparse-checkout disable &&
> +       test_path_is_missing repo/.git/info/sparse-checkout &&
> +       git -C repo config --list >config &&

Should this command also have a --worktree flag (just for consistency,
even if we know there will only be one worktree so it doesn't matter
in practice)?  There are probably similar cases I overlooked on the
previous patches.

> +       test_i18ngrep "core.sparsecheckout=false" config &&
> +       ls repo >dir &&
> +       cat >expect <<-EOF &&
> +               a
> +               deep
> +               folder1
> +               folder2
> +       EOF
> +       test_cmp expect dir
> +'
> +
>  test_done
> \ No newline at end of file
> --
> gitgitgadget
>

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 6/9] trace2:experiment: clear_ce_flags_1
  2019-08-20 15:11 ` [PATCH 6/9] trace2:experiment: clear_ce_flags_1 Jeff Hostetler via GitGitGadget
@ 2019-08-24  0:08   ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-24  0:08 UTC (permalink / raw)
  To: Jeff Hostetler via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Jeff Hostetler

On Tue, Aug 20, 2019 at 8:12 AM Jeff Hostetler via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Jeff Hostetler <jeffhost@microsoft.com>

Can the commit summary be turned into English?

> The clear_ce_flags_1 method is used by many types of calls to
> unpack_trees(). Add trace2 regions around the method, including
> some flag information, so we can get granular performance data
> during experiments.

It might be nice to have some words in the cover letter about why this
patch is included in this series instead of being a separate
submission.  I'm not familiar with the trace2 stuff yet; this looks
probably useful, but the commit message makes it sound like something
general rather than specific to this series.

> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
<snip>

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 7/9] sparse-checkout: add 'cone' mode
  2019-08-20 15:11 ` [PATCH 7/9] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
@ 2019-08-24  0:31   ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-24  0:31 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:13 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The sparse-checkout feature can have quadratic performance as
> the number of patterns and number of entries in the index grow.
> If there are 1,000 patterns and 1,000,000 entries, this time can
> be very significant.
>
> Create a new 'cone' mode for the core.sparseCheckout config
> option, and adjust the parser to set an appropriate enum value.
>
> While adjusting the type of this variable, rename it from
> core_apply_sparse_checkout to core_sparse_checkout. This will
> help avoid parallel changes from hitting type issues, and we
> can guarantee that all uses now consider the enum values instead
> of the int value.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/config/core.txt         |  7 ++--
>  Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++
>  builtin/clone.c                       |  2 +-
>  builtin/sparse-checkout.c             | 16 +++++----
>  cache.h                               |  8 ++++-
>  config.c                              | 10 +++++-
>  environment.c                         |  2 +-
>  t/t1091-sparse-checkout-builtin.sh    | 14 ++++++++
>  unpack-trees.c                        |  2 +-
>  9 files changed, 98 insertions(+), 13 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index 75538d27e7..9b8ab2a6d4 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -591,8 +591,11 @@ core.multiPackIndex::
>         multi-pack-index design document].
>
>  core.sparseCheckout::
> -       Enable "sparse checkout" feature. See section "Sparse checkout" in
> -       linkgit:git-read-tree[1] for more information.
> +       Enable "sparse checkout" feature. If "false", then sparse-checkout
> +       is disabled. If "true", then sparse-checkout is enabled with the full
> +       .gitignore pattern set. If "cone", then sparse-checkout is enabled with
> +       a restricted pattern set. See linkgit:git-sparse-checkout[1] for more
> +       information.
>
>  core.abbrev::
>         Set the length object names are abbreviated to.  If
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index de04b768ae..463319055b 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -86,6 +86,56 @@ negate patterns. For example, to remove the file `unwanted`:
>  ----------------
>
>
> +## CONE PATTERN SET
> +
> +The full pattern set allows for arbitrary pattern matches and complicated
> +inclusion/exclusion rules. These can result in O(N*M) pattern matches when
> +updating the index, where N is the number of patterns and M is the number
> +of paths in the index. To combat this performance issue, a more restricted
> +pattern set is allowed when `core.sparseCheckout` is set to `cone`.
> +
> +The accepted patterns in the cone pattern set are:
> +
> +1. *Recursive:* All paths inside a directory are included.
> +
> +2. *Parent:* All files immediately inside a directory are included.
> +
> +In addition to the above two patterns, we also expect that all files in the
> +root directory are included. If a recursive pattern is added, then all
> +leading directories are added as parent patterns.
> +
> +By default, when running `git sparse-checkout init`, the root directory is
> +added as a parent pattern. At this point, the sparse-checkout file contains
> +the following patterns:
> +
> +```
> +/*
> +!/*/*
> +```
> +
> +This says "include everything in root, but nothing two levels below root."

...but nothing at the level below root...?

> +If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
> +`A/B` are added as parent patterns. The resulting sparse-checkout file is
> +now
> +
> +```
> +/*
> +!/*/*
> +/A/*
> +!/A/*/*
> +/A/B/*
> +!/A/B/*/*
> +/A/B/C/*
> +```

Can we dispense with the trailing asterisks (other than on the first
line for the root level)?  This reads a lot cleaner to me:

```
/*
!/*/
/A/
!/A/*/
/A/B/
!/A/B/*/
/A/B/C/
```

We could also dispense with the trailing '/' on the inclusion lines
from this version, but I'm not sure that helps.

> +
> +Here, order matters, so the negative patterns are overridden by the positive
> +patterns that appear lower in the file.
> +
> +If `core.sparseCheckout=cone`, then Git will parse the sparse-checkout file
> +expecting patterns of these types. Git will warn if the patterns do not match.
> +If the patterns do match the expected format, then Git will use faster hash-
> +based algorithms to compute inclusion in the sparse-checkout.
> +
>  SEE ALSO
>  --------
>
> diff --git a/builtin/clone.c b/builtin/clone.c
> index d6d49a73ff..763898ada5 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -747,7 +747,7 @@ static int git_sparse_checkout_init(const char *repo)
>          * We must apply the setting in the current process
>          * for the later checkout to use the sparse-checkout file.
>          */
> -       core_apply_sparse_checkout = 1;
> +       core_sparse_checkout = SPARSE_CHECKOUT_FULL;
>
>         if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>                 error(_("failed to initialize sparse-checkout"));
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 8f97c27ec7..77e5235720 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -79,18 +79,22 @@ static int sc_read_tree(void)
>         return result;
>  }
>
> -static int sc_set_config(int mode)
> +static int sc_set_config(enum sparse_checkout_mode mode)
>  {
>         struct argv_array argv = ARGV_ARRAY_INIT;
>         int result = 0;
>         argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", NULL);
>
>         switch (mode) {
> -       case 1:
> +       case SPARSE_CHECKOUT_FULL:
>                 argv_array_pushl(&argv, "true", NULL);
>                 break;
>
> -       case 0:
> +       case SPARSE_CHECKOUT_CONE:
> +               argv_array_pushl(&argv, "cone", NULL);
> +               break;
> +
> +       case SPARSE_CHECKOUT_NONE:
>                 argv_array_pushl(&argv, "false", NULL);
>                 break;
>
> @@ -138,7 +142,7 @@ static int sparse_checkout_init(int argc, const char **argv)
>         FILE *fp;
>         int res;
>
> -       if (sc_set_config(1))
> +       if (sc_set_config(SPARSE_CHECKOUT_FULL))

Going back to my comment on the previous patch, perhaps
SPARSE_CHECKOUT_FULL could be the string "true", so you can avoid the
switch statement?

>                 return 1;
>
>         memset(&el, 0, sizeof(el));
> @@ -212,7 +216,7 @@ static int sparse_checkout_disable(int argc, const char **argv)
>         char *sparse_filename;
>         FILE *fp;
>
> -       if (sc_set_config(1))
> +       if (sc_set_config(SPARSE_CHECKOUT_FULL))
>                 die(_("failed to change config"));
>
>         sparse_filename = get_sparse_checkout_filename();
> @@ -226,7 +230,7 @@ static int sparse_checkout_disable(int argc, const char **argv)
>         unlink(sparse_filename);
>         free(sparse_filename);
>
> -       return sc_set_config(0);
> +       return sc_set_config(SPARSE_CHECKOUT_NONE);
>  }
>
>  int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
> diff --git a/cache.h b/cache.h
> index b1da1ab08f..4426816ca1 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -865,12 +865,18 @@ extern char *git_replace_ref_base;
>
>  extern int fsync_object_files;
>  extern int core_preload_index;
> -extern int core_apply_sparse_checkout;
>  extern int precomposed_unicode;
>  extern int protect_hfs;
>  extern int protect_ntfs;
>  extern const char *core_fsmonitor;
>
> +enum sparse_checkout_mode {
> +       SPARSE_CHECKOUT_NONE = 0,
> +       SPARSE_CHECKOUT_FULL = 1,
> +       SPARSE_CHECKOUT_CONE = 2,
> +};
> +enum sparse_checkout_mode core_sparse_checkout;
> +
>  /*
>   * Include broken refs in all ref iterations, which will
>   * generally choke dangerous operations rather than letting

Wait, you're not changing the add command?  So the cone mode just...?

> diff --git a/config.c b/config.c
> index 3900e4947b..15b7a20dd9 100644
> --- a/config.c
> +++ b/config.c
> @@ -1360,7 +1360,15 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>         }
>
>         if (!strcmp(var, "core.sparsecheckout")) {
> -               core_apply_sparse_checkout = git_config_bool(var, value);
> +               int result = git_parse_maybe_bool(value);
> +
> +               if (result < 0) {
> +                       core_sparse_checkout = SPARSE_CHECKOUT_NONE;
> +
> +                       if (!strcasecmp(value, "cone"))
> +                               core_sparse_checkout = SPARSE_CHECKOUT_CONE;
> +               } else
> +                       core_sparse_checkout = result;
>                 return 0;
>         }
>
> diff --git a/environment.c b/environment.c
> index 89af47cb85..cc12e30bd6 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -68,7 +68,7 @@ enum push_default_type push_default = PUSH_DEFAULT_UNSPECIFIED;
>  enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
>  char *notes_ref_name;
>  int grafts_replace_parents = 1;
> -int core_apply_sparse_checkout;
> +enum sparse_checkout_mode core_sparse_checkout;
>  int merge_log_config = -1;
>  int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
>  unsigned long pack_size_limit_cfg;
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 68ca63a6f6..8cc377b839 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -120,6 +120,20 @@ test_expect_success 'add to existing sparse-checkout' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'cone mode: match patterns' '
> +       git -C repo config --replace-all core.sparseCheckout cone &&

--replace-all?  This makes me wonder if you were actually doing the
--add in the previous patchsets on purpose.  I'm so confused.

> +       rm -rf repo/a repo/folder1 repo/folder2 &&
> +       git -C repo read-tree -mu HEAD &&
> +       git -C repo reset --hard &&
> +       ls repo >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               folder1
> +               folder2
> +       EOF
> +       test_cmp expect dir
> +'
> +
>  test_expect_success 'sparse-checkout disable' '
>         git -C repo sparse-checkout disable &&
>         test_path_is_missing repo/.git/info/sparse-checkout &&
> diff --git a/unpack-trees.c b/unpack-trees.c
> index 8c3b5e8849..289c62305f 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -1468,7 +1468,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>
>         trace_performance_enter();
>         memset(&el, 0, sizeof(el));
> -       if (!core_apply_sparse_checkout || !o->update)
> +       if (!core_sparse_checkout || !o->update)
>                 o->skip_sparse_checkout = 1;
>         if (!o->skip_sparse_checkout) {
>                 char *sparse = git_pathdup("info/sparse-checkout");
> --
> gitgitgadget

Wait...I didn't see anything checking the value of "cone" and using
it, it only has an ability to set it.  What's the point?  Or is that
going to come in a later patch?  (If it does, should the commit
message mention that?)

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 8/9] sparse-checkout: use hashmaps for cone patterns
  2019-08-20 15:11 ` [PATCH 8/9] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
@ 2019-08-24  4:56   ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-24  4:56 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The parent and recursive patterns allowed by the "cone mode"
> option in sparse-checkout are restrictive enough that we
> can avoid using the regex parsing. Everything is based on
> prefix matches, so we can use hashsets to store the prefixes
> from the sparse-checkout file. When checking a path, we can
> strip path entries from the path and check the hashset for
> an exact match.
>
> As a test, I created a cone-mode sparse-checkout file for the
> Linux repository that actually includes every file. This was
> constructed by taking every folder in the Linux repo and creating
> the pattern pairs here:
>
>         /$folder/*
>         !/$folder/*/*
>
> This resulted in a sparse-checkout file with 8,296 patterns.
> Running 'git read-tree -mu HEAD' on this file had the following
> performance:
>
>         core.sparseCheckout=false: 0.21 s (0.00 s)
>          core.sparseCheckout=true: 3.75 s (3.50 s)
>          core.sparseCheckout=cone: 0.23 s (0.01 s)

Nice!

> The times in parentheses above correspond to the time spent
> in the first clear_ce_flags() call, according to the trace2
> performance traces.
>
> While this example is contrived, it demonstrates how these
> patterns can slow the sparse-checkout feature.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-sparse-checkout.txt |   1 -
>  dir.c                                 | 154 +++++++++++++++++++++++++-
>  dir.h                                 |  27 +++++
>  t/t1091-sparse-checkout-builtin.sh    |   8 ++
>  4 files changed, 183 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index 463319055b..7ade827370 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -85,7 +85,6 @@ negate patterns. For example, to remove the file `unwanted`:
>  !unwanted
>  ----------------
>
> -
>  ## CONE PATTERN SET
>
>  The full pattern set allows for arbitrary pattern matches and complicated
> diff --git a/dir.c b/dir.c
> index d021c908e5..2c5ff89a72 100644
> --- a/dir.c
> +++ b/dir.c
> @@ -599,6 +599,99 @@ void parse_exclude_pattern(const char **pattern,
>         *patternlen = len;
>  }
>
> +static int el_hashmap_cmp(const void *unused_cmp_data,
> +                         const void *a, const void *b, const void *key)
> +{
> +       const struct exclude_entry *ee1 = a;
> +       const struct exclude_entry *ee2 = b;
> +
> +       return strncmp(ee1->pattern, ee2->pattern, ee1->patternlen);
> +}
> +
> +static void add_exclude_to_hashsets(struct exclude_list *el, struct exclude *x)

Is this for something that is logically an "include" or an "exclude"?
The earlier use of "exclude" for "include"s now has me totally confused
and makes me know I'll have to read the code really carefully and may
still get messed up.

I don't know if there's a better way, but re-using .gitignore internals
for sparse-checkout stuff leads to this inverted switcheroo and makes
things hard to follow.  Is it too late or too hard to easily rename the
"exclude" stuff used by gitignore to something else more neutral?
Sigh...

> +{
> +       struct exclude_entry *e;
> +       char *truncated;
> +       char *data = NULL;
> +
> +       if (!el->use_cone_patterns)
> +               return;
> +
> +       if (x->patternlen >= 4 &&
> +           !strcmp(x->pattern + x->patternlen - 4, "/*/*")) {
> +               if (!(x->flags & EXC_FLAG_NEGATIVE)) {
> +                       /* Not a cone pattern. */
> +                       el->use_cone_patterns = 0;
> +                       warning(_("unrecognized pattern: '%s'"), x->pattern);
> +                       goto clear_hashmaps;
> +               }
> +
> +               truncated = xstrdup(x->pattern);
> +               truncated[x->patternlen - 4] = 0;
> +
> +               e = xmalloc(sizeof(struct exclude_entry));
> +               e->pattern = truncated;
> +               e->patternlen = x->patternlen - 4;
> +               hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
> +
> +               if (!hashmap_get(&el->recursive_hashmap, e, NULL)) {
> +                       /* We did not see the "parent" included */
> +                       warning(_("unrecognized negative pattern: '%s'"), x->pattern);
> +                       free(truncated);
> +                       goto clear_hashmaps;
> +               }
> +
> +               hashmap_add(&el->parent_hashmap, e);
> +               hashmap_remove(&el->recursive_hashmap, e, &data);
> +               free(data);
> +               return;
> +       }
> +
> +       if (x->patternlen >= 2 &&
> +           !strcmp(x->pattern + x->patternlen - 2, "/*")) {
> +               if (x->flags & EXC_FLAG_NEGATIVE) {
> +                       warning(_("unrecognized negative pattern: '%s'"), x->pattern);
> +                       goto clear_hashmaps;
> +               }
> +
> +               e = xmalloc(sizeof(struct exclude_entry));
> +
> +               truncated = xstrdup(x->pattern);
> +               truncated[x->patternlen - 2] = 0;
> +               e->pattern = truncated;
> +               e->patternlen = x->patternlen - 2;
> +               hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
> +
> +               hashmap_add(&el->recursive_hashmap, e);
> +
> +               if (hashmap_get(&el->parent_hashmap, e, NULL)) {
> +                       /* we already included this at the parent level */
> +                       warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
> +                               x->pattern);
> +                       hashmap_remove(&el->parent_hashmap, e, &data);
> +                       free(data);
> +               }
> +               return;
> +       }
> +
> +clear_hashmaps:
> +       hashmap_free(&el->parent_hashmap, 1);
> +       hashmap_free(&el->recursive_hashmap, 1);
> +       el->use_cone_patterns = 0;
> +}
> +
> +static int hashmap_contains_path(struct hashmap *map,
> +                                struct strbuf *pattern)
> +{
> +       struct exclude_entry e;
> +
> +       /* Check straight mapping */
> +       e.pattern = pattern->buf;
> +       e.patternlen = pattern->len;
> +       hashmap_entry_init(&e, memhash(e.pattern, e.patternlen));
> +       return !!hashmap_get(map, &e, NULL);
> +}
> +
>  void add_exclude(const char *string, const char *base,
>                  int baselen, struct exclude_list *el, int srcpos)
>  {
> @@ -623,6 +716,8 @@ void add_exclude(const char *string, const char *base,
>         ALLOC_GROW(el->excludes, el->nr + 1, el->alloc);
>         el->excludes[el->nr++] = x;
>         x->el = el;
> +
> +       add_exclude_to_hashsets(el, x);
>  }
>
>  static int read_skip_worktree_file_from_index(const struct index_state *istate,
> @@ -848,6 +943,10 @@ static int add_excludes_from_buffer(char *buf, size_t size,
>         int i, lineno = 1;
>         char *entry;
>
> +       el->use_cone_patterns = core_sparse_checkout == SPARSE_CHECKOUT_CONE ? 1 : 0;
> +       hashmap_init(&el->recursive_hashmap, el_hashmap_cmp, NULL, 0);
> +       hashmap_init(&el->parent_hashmap, el_hashmap_cmp, NULL, 0);
> +
>         el->filebuf = buf;
>
>         if (skip_utf8_bom(&buf, size))
> @@ -1070,18 +1169,61 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname,
>
>  /*
>   * Scan the list and let the last match determine the fate.
> - * Return 1 for exclude, 0 for include and -1 for undecided.
> + * Return 0 for exclude, 1 for include and -1 for undecided.

Um...this doesn't make me feel any better about whether we're going to
run into bugs about "include" vs. "exclude".

>   */
>  int is_excluded_from_list(const char *pathname,
>                           int pathlen, const char *basename, int *dtype,
>                           struct exclude_list *el, struct index_state *istate)
>  {
>         struct exclude *exclude;
> -       exclude = last_exclude_matching_from_list(pathname, pathlen, basename,
> -                                                 dtype, el, istate);
> -       if (exclude)
> -               return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1;
> -       return -1; /* undecided */
> +       struct strbuf parent_pathname = STRBUF_INIT;
> +       int result = 0;
> +       const char *slash_pos;
> +
> +       if (!el->use_cone_patterns) {
> +               exclude = last_exclude_matching_from_list(pathname, pathlen, basename,
> +                                                               dtype, el, istate);
> +
> +               if (exclude)
> +                       return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1;
> +
> +               return -1; /* undecided */
> +       }
> +
> +       strbuf_addch(&parent_pathname, '/');
> +       strbuf_add(&parent_pathname, pathname, pathlen);
> +       slash_pos = strrchr(parent_pathname.buf, '/');
> +
> +       if (slash_pos == parent_pathname.buf) {
> +               /* include every file in root */
> +               result = 1;
> +               goto done;
> +       }
> +
> +       strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
> +
> +       if (hashmap_contains_path(&el->parent_hashmap, &parent_pathname)) {
> +               result = 1;
> +               goto done;
> +       }
> +
> +       while (parent_pathname.len) {
> +               if (hashmap_contains_path(&el->recursive_hashmap,
> +                                         &parent_pathname)) {
> +                       result = -1;
> +                       goto done;
> +               }
> +
> +               slash_pos = strrchr(parent_pathname.buf, '/');
> +               if (slash_pos == parent_pathname.buf)
> +                       break;
> +
> +               strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
> +       }
> +
> +done:
> +       strbuf_release(&parent_pathname);
> +       return result;
>  }
>
>  static struct exclude *last_exclude_matching_from_lists(struct dir_struct *dir,
> diff --git a/dir.h b/dir.h
> index 680079bbe3..2d3356d1c0 100644
> --- a/dir.h
> +++ b/dir.h
> @@ -4,6 +4,7 @@
>  /* See Documentation/technical/api-directory-listing.txt */
>
>  #include "cache.h"
> +#include "hashmap.h"
>  #include "strbuf.h"
>
>  struct dir_entry {
> @@ -37,6 +38,13 @@ struct exclude {
>         int srcpos;
>  };
>
> +/* used for hashmaps for cone patterns */
> +struct exclude_entry {
> +       struct hashmap_entry ent;
> +       char *pattern;
> +       size_t patternlen;
> +};
> +
>  /*
>   * Each excludes file will be parsed into a fresh exclude_list which
>   * is appended to the relevant exclude_list_group (either EXC_DIRS or
> @@ -55,6 +63,25 @@ struct exclude_list {
>         const char *src;
>
>         struct exclude **excludes;
> +
> +       /*
> +        * While scanning the excludes, we attempt to match the patterns
> +        * with a more restricted set that allows us to use hashsets for
> +        * matching logic, which is faster than the linear lookup in the
> +        * excludes array above. If non-zero, that check succeeded.
> +        */
> +       unsigned use_cone_patterns;
> +
> +       /*
> +        * Stores paths where everything starting with those paths
> +        * is included.
> +        */
> +       struct hashmap recursive_hashmap;
> +
> +       /*
> +        * Used to check single-level parents of blobs.
> +        */
> +       struct hashmap parent_hashmap;
>  };
>
>  /*
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 8cc377b839..60f10864a1 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -134,6 +134,14 @@ test_expect_success 'cone mode: match patterns' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'cone mode: warn on bad pattern' '
> +       test_when_finished mv sparse-checkout repo/.git/info &&
> +       cp repo/.git/info/sparse-checkout . &&
> +       echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout &&
> +       git -C repo read-tree -mu HEAD 2>err &&
> +       test_i18ngrep "unrecognized negative pattern" err
> +'
> +
>  test_expect_success 'sparse-checkout disable' '
>         git -C repo sparse-checkout disable &&
>         test_path_is_missing repo/.git/info/sparse-checkout &&
> --
> gitgitgadget

So, uh, I saw the exclude vs. include thing, started scanning to see
if it was going to get better, saw the next include/exclude thing that
triggered a bell, and I more or less just scrolled quickly to the
bottom checking if anything might catch my eye.  So I didn't look at
this patch very closely at all.  I decided to just start playing with
the series instead...and as far as that goes, this patch is probably
fine because my testing seemed to work.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 9/9] sparse-checkout: init and add in cone mode
  2019-08-20 15:11 ` [PATCH 9/9] sparse-checkout: init and add in cone mode Derrick Stolee via GitGitGadget
@ 2019-08-24  5:07   ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-24  5:07 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>

Perhaps "sparse-checkout: modify 'init' and 'add' for cone mode" for
the summary?

> From: Derrick Stolee <dstolee@microsoft.com>
>
> To make the cone pattern set easy to use, update the behavior of
> 'git sparse-checkout [init|add]'.

Maybe switch these phrases around (starting with "Update", ending with "use.")

> Add '--cone' flag to 'git sparse-checkout init' to set the config
> option 'core.sparseCheckout=cone'.
>
> When running 'git sparse-checkout add' in cone mode, a user only
> needs to supply a list of recursive folder matches. Git will
> automatically add the necessary parent matches for the leading
> directories.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  builtin/sparse-checkout.c          | 134 +++++++++++++++++++++++++++--
>  t/t1091-sparse-checkout-builtin.sh |  35 ++++++++
>  2 files changed, 164 insertions(+), 5 deletions(-)
>
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 77e5235720..0a4e101ddd 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -6,15 +6,22 @@
>  #include "repository.h"
>  #include "run-command.h"
>  #include "strbuf.h"
> +#include "string-list.h"
>
>  static char const * const builtin_sparse_checkout_usage[] = {
>         N_("git sparse-checkout [init|add|list|disable]"),
>         NULL
>  };
>
> +static const char * const builtin_sparse_checkout_init_usage[] = {
> +       N_("git sparse-checkout init [--cone]"),
> +       NULL
> +};
> +
>  struct opts_sparse_checkout {
>         const char *subcommand;
>         int read_stdin;
> +       int cone;
>  } opts;
>
>  static char *get_sparse_checkout_filename(void)
> @@ -41,6 +48,60 @@ static void write_excludes_to_file(FILE *fp, struct exclude_list *el)
>         }
>  }
>
> +static void write_cone_to_file(FILE *fp, struct exclude_list *el)
> +{
> +       int i;
> +       struct exclude_entry *entry;
> +       struct hashmap_iter iter;
> +       struct string_list sl = STRING_LIST_INIT_DUP;
> +
> +       hashmap_iter_init(&el->parent_hashmap, &iter);
> +       while ((entry = hashmap_iter_next(&iter))) {
> +               char *pattern = xstrdup(entry->pattern);
> +               char *converted = pattern;
> +               if (pattern[0] == '/')
> +                       converted++;
> +               if (pattern[entry->patternlen - 1] == '/')
> +                       pattern[entry->patternlen - 1] = 0;
> +               string_list_insert(&sl, converted);
> +               free(pattern);
> +       }
> +
> +       string_list_sort(&sl);

I was worried that if someone had a directory named '(parenthetical)'
or '%looks_like_a_comment' that they wanted to include in cone mode,
that since '*' sorts after '(' and '%' that entries would appear in
sparse-checkout in the wrong order.  But you are comparing on the
directory name, not the full line that will be written to the
sparse-checkout file (i.e. without the trailing '/*'), so my worries
were unfounded and there's no actual problem here.

> +       string_list_remove_duplicates(&sl, 0);
> +
> +       for (i = 0; i < sl.nr; i++) {
> +               char *pattern = sl.items[i].string;
> +
> +               if (!strcmp(pattern, ""))
> +                       fprintf(fp, "/*\n!/*/*\n");
> +               else
> +                       fprintf(fp, "/%s/*\n!/%s/*/*\n", pattern, pattern);
> +       }
> +
> +       string_list_clear(&sl, 0);
> +
> +       hashmap_iter_init(&el->recursive_hashmap, &iter);
> +       while ((entry = hashmap_iter_next(&iter))) {
> +               char *pattern = xstrdup(entry->pattern);
> +               char *converted = pattern;
> +               if (pattern[0] == '/')
> +                       converted++;
> +               if (pattern[entry->patternlen - 1] == '/')
> +                       pattern[entry->patternlen - 1] = 0;
> +               string_list_insert(&sl, converted);
> +               free(pattern);
> +       }
> +
> +       string_list_sort(&sl);
> +       string_list_remove_duplicates(&sl, 0);
> +
> +       for (i = 0; i < sl.nr; i++) {
> +               char *pattern = sl.items[i].string;
> +               fprintf(fp, "/%s/*\n", pattern);
> +       }
> +}
> +
>  static int sparse_checkout_list(int argc, const char **argv)

Should the list mode in cone mode be modified to just show the
directories the user added?  It seems a little weird to show the
internal details of the implementation (all the parent directories and
negated entries and whatnot).  That's also not in a form that users
can pass along to future `sparse-checkout add` invocations.

In fact, maybe the implementation should be changed?  Perhaps we
should write out a .git/info/sparse-checkout-cone file instead with
just the wanted directories (with a trailing slash to permit future
addition of individual paths without bringing parents along)?  It'd be
easier on the parsing, and older git doesn't work with cone mode
anyway:

$ /usr/bin/git status
fatal: bad numeric config value 'cone' for 'core.sparsecheckout': invalid unit

Or, if we need to make old git work reasonably with cone mode, we
could write *both* .git/info/sparse-checkout-cone and
.git/info/sparse-checkout, but the latter begins with a comment
something akin to:

# DO NOT EDIT; auto-generated by 'git sparse-checkout' command and
only used by older git clients; see .git/info/sparse-checkout-cone
for active settings

>  {
>         struct exclude_list el;
> @@ -141,8 +202,21 @@ static int sparse_checkout_init(int argc, const char **argv)
>         char *sparse_filename;
>         FILE *fp;
>         int res;
> +       enum sparse_checkout_mode mode;
>
> -       if (sc_set_config(SPARSE_CHECKOUT_FULL))
> +       static struct option builtin_sparse_checkout_init_options[] = {
> +               OPT_BOOL(0, "cone", &opts.cone,
> +                        N_("initialize the sparse-checkout in cone mode")),
> +               OPT_END(),
> +       };
> +
> +       argc = parse_options(argc, argv, NULL,
> +                            builtin_sparse_checkout_init_options,
> +                            builtin_sparse_checkout_init_usage, 0);
> +
> +       mode = opts.cone ? SPARSE_CHECKOUT_CONE : SPARSE_CHECKOUT_FULL;
> +
> +       if (sc_set_config(mode))
>                 return 1;
>
>         memset(&el, 0, sizeof(el));
> @@ -183,6 +257,34 @@ static int sparse_checkout_init(int argc, const char **argv)
>         return sc_read_tree();
>  }
>
> +static void insert_recursive_pattern(struct exclude_list *el, struct strbuf *path)
> +{
> +       struct exclude_entry *e = xmalloc(sizeof(struct exclude_entry));
> +       e->patternlen = path->len;
> +       e->pattern = strbuf_detach(path, NULL);
> +       hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
> +
> +       hashmap_add(&el->recursive_hashmap, e);
> +
> +       while (e->patternlen) {
> +               char *slash = strrchr(e->pattern, '/');
> +               char *oldpattern = e->pattern;
> +               size_t newlen;
> +
> +               if (!slash)
> +                       break;
> +
> +               newlen = slash - e->pattern;
> +               e = xmalloc(sizeof(struct exclude_entry));
> +               e->patternlen = newlen;
> +               e->pattern = xstrndup(oldpattern, newlen);
> +               hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
> +
> +               if (!hashmap_get(&el->parent_hashmap, e, NULL))
> +                       hashmap_add(&el->parent_hashmap, e);
> +       }
> +}
> +
>  static int sparse_checkout_add(int argc, const char **argv)
>  {
>         struct exclude_list el;
> @@ -196,11 +298,33 @@ static int sparse_checkout_add(int argc, const char **argv)
>         add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
>
>         fp = fopen(sparse_filename, "w");
> -       write_excludes_to_file(fp, &el);
>
> -       while (!strbuf_getline(&line, stdin)) {
> -               strbuf_trim(&line);
> -               fprintf(fp, "%s\n", line.buf);
> +       if (core_sparse_checkout == SPARSE_CHECKOUT_FULL) {
> +               write_excludes_to_file(fp, &el);
> +
> +               while (!strbuf_getline(&line, stdin)) {
> +                       strbuf_trim(&line);
> +                       fprintf(fp, "%s\n", line.buf);
> +               }
> +       } else if (core_sparse_checkout == SPARSE_CHECKOUT_CONE) {
> +               while (!strbuf_getline(&line, stdin)) {
> +                       strbuf_trim(&line);
> +
> +                       strbuf_trim_trailing_dir_sep(&line);
> +
> +                       if (!line.len)
> +                               continue;
> +
> +                       if (line.buf[0] == '/')
> +                               strbuf_remove(&line, 0, 1);
> +
> +                       if (!line.len)
> +                               continue;
> +
> +                       insert_recursive_pattern(&el, &line);
> +               }
> +
> +               write_cone_to_file(fp, &el);
>         }

No sanity checking that the user provided directory names rather than
globs or regexes?  Won't that silently do nothing, suggesting we need
to warn users?  Or are we just assuming that any 'glob' or 'regex'
characters they provide are actually meant as literal characters in a
pathname?

Also, what if they try to add an exclusion rule for a directory to
undo their add?  e.g.
  echo "!some/subdir/" | git sparse-checkout add
?

Is there a way to undo an add without undoing everything?  (Perhaps if
'git sparse-checkout list' showed just the added directories, it'd be
easy for users to take that output, delete one they don't want, and
then deal with undoing and redoing.  Is that how we should handle it?)

>
>         fclose(fp);
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 60f10864a1..3412bafdff 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -157,4 +157,39 @@ test_expect_success 'sparse-checkout disable' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'cone mode: init and add' '
> +       git -C repo sparse-checkout init --cone &&
> +       git -C repo config --list >config &&
> +       test_i18ngrep "core.sparsecheckout=cone" config &&
> +       ls repo >dir  &&
> +       echo a >expect &&
> +       test_cmp expect dir &&
> +       echo deep/deeper1/deepest | git -C repo sparse-checkout add &&
> +       ls repo >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               deep
> +       EOF
> +       ls repo/deep >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               deeper1
> +       EOF
> +       ls repo/deep/deeper1 >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               deepest
> +       EOF
> +       test_cmp expect dir &&
> +       cat >expect <<-EOF &&
> +               /*
> +               !/*/*
> +               /deep/*
> +               !/deep/*/*
> +               /deep/deeper1/*
> +               !/deep/deeper1/*/*
> +               /deep/deeper1/deepest/*
> +       EOF
> +       test_cmp expect repo/.git/info/sparse-checkout
> +'
>  test_done
> \ No newline at end of file
> --
> gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-22 13:10   ` Derrick Stolee
  2019-08-22 14:25     ` Derrick Stolee
@ 2019-08-24  5:40     ` Elijah Newren
  2019-08-26 13:29       ` Derrick Stolee
  2019-09-02 17:55       ` Eric Sunshine
  1 sibling, 2 replies; 196+ messages in thread
From: Elijah Newren @ 2019-08-24  5:40 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List, Junio C Hamano

On Thu, Aug 22, 2019 at 6:10 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 8/21/2019 5:52 PM, Elijah Newren wrote:
> > On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
> > <gitgitgadget@gmail.com> wrote:

<snip>
> >> Here are some more specific details:
> >>
> >>  * git sparse-checkout init enables core.sparseCheckout and populates the
> >>    sparse-checkout file with patterns that match only the files at root.
> >
> > Does it enable core.sparseCheckout in the current worktree, or for all
> > worktrees?  Do we require extensions.worktreeConfig to be set to true
> > first?  If we don't require extensions.worktreeConfig to be set to
> > true, and users add worktrees later, do they encounter negative
> > surprises (immediately or later)?
>
> This is an interesting scenario that I had not considered. Thanks!
>
> My guess is that we should set `extensions.worktreeConfig=true` to
> avoid surprises. I'll need to play with this to discover the answers
> to these questions:
>
> 1. Where does the worktree look for the sparse-checkout file? Does
>    each worktree have its own sparse-checkout file? Should it?

For the main/first/primary worktree: .git/info/sparse-checkout
For all other worktrees: .git/worktrees/$WORKTREE/info/sparse-checkout

So, yes, each has its own, and from my viewpoint, absolutely yes that
is what we want.

> 2. If I have `extensions.worktreeConfig=true` and `core.sparseCheckout=true`
>    in the current worktree and run `git worktree add`, does the new worktree
>    have `core.sparseCheckout=true`? Can we `git clone --sparse` and then
>    start building sparse worktrees seamlessly?

My $0.02: I think `git worktree add` should not only adopt the setting
of core.sparseCheckout from the current worktree, but it should also
adopt the $GIT_DIR/info/sparse-checkout file too.  Granted, users can
change it to something else, but much like a new shell starts up with
the same current working directory as its parent shell, I think it'd
be most obvious for people to have a worktree that looked similar to
the one they launched it from.

<snip>
> > The default of reading from stdin seems a bit unusual to me, and I
> > worry about having to explain that to users.  I'd rather the add
> > command took positional parameters (anything that doesn't start with a
> > hyphen) and added those, e.g.
> >   $ git sparse-checkout add '/myFolder/*' '
> > with the option of the user specifying --stdin.
>
> I had the same thought, and likely that's where we should go with the
> builtin. For our needs, the input over stdin is more important for
> testing, so I built it first. I will adjust the CLI here to take a set
> of paths over the arguments unless --stdin is given.
>
> >>  * git sparse-checkout disable removes the patterns from the sparse-checkout
> >>    file, disables core.sparseCheckout, and refills the working directory.
> >
> > Does it leave an empty sparse-checkout file around?  Also, what if
> > users have several paths defining their sparse pattern, and want to
> > temporarily get a full checkout and then come back -- do they need to
> > re-specify all the paths?  (Maybe this *is* the route we want to go;
> > I'm just trying to mention any possible negative effects we _might_
> > run into so we can consider them.  It's not quite as relevant in my
> > case since people specify a few toplevel modules and sparse-checkout
> > gets several entries auto-generated for them.)
>
> In this case, there is an intermediate step (that follows the existing
> advice) to modify the sparse-checkout file to contain only "/*\n" then
> run read-tree to fill the working directory before disabling the config
> setting.
>
> Perhaps "disable" is the wrong word to use, as it makes you think that
> there should be an "enable" that can quickly toggle between the two
> modes. Maybe instead it should be "git sparse-checkout reset [empty|full]"
> where you could 'reset' the sparse-checkout to one of two initial
> states:
>
> 1. empty: only files at root are included.
> 2. full: all files are included.
>
> In each case, we would obliterate the existing sparse-checkout entries,
> but hopefully that behavior is more clear from the command names.

Will "reset" be seen as slightly less obvious wording that needs to be
explained to users?  If so, maybe have "undo" and "empty" verbs?  (Of
course, "init" already empties, both when starting from full or when
we have some kind of sparse checkout.)  I dunno, just some ideas.

> >> In cone mode, a user specifies a list of folders which the user wants every
> >> file inside. In addition, the cone adds all blobs that are siblings of the
> >> folders in the directory path to that folder. This makes the directories
> >> look "hydrated" as a user drills down to those recursively-closed folders.
> >> These directories are called "parent" folders, as a file matches them only
> >> if the file's immediate parent is that directory.
> >>
> >> When building a prototype of this feature, I used a separate file to contain
> >> the list of recursively-closed folders and built the hashsets dynamically
> >> based on that file. In this implementation, I tried to maximize the amount
> >> of backwards-compatibility by storing all data in the sparse-checkout file
> >> using patterns recognized by earlier Git versions.
> >>
> >> For example, if we add A/B/C as a recursive folder, then we add the
> >> following patterns to the sparse-checkout file:
> >>
> >> /*
> >> !/*/*
> >> /A/*
> >> !/A/*/*
> >> /A/B/*
> >> !/A/B/*/*
> >> /A/B/C/*
> >>
> >> The alternating positive/negative patterns say "include everything in this
> >> folder, but exclude everything another level deeper". The final pattern has
> >> no matching negation, so is a recursively closed pattern.
> >
> > Oh, um, would there be any option for fast but without grabbing
> > sibling and parent files of requested directories?  And could users
> > still request individual files (not with regex or pathspec, but fully
> > specifying the path) and still get the fast mode?
>
> Exact files could probably be included and still be fast. It requires an
> extra hash check per entry, but that's a small price to pay I think.
>
> With the sibling files, this is something I believe to be user-friendly:
> as a user drills down into the folder they included recursively, there may
> be helpful files along the way, like documentation, project files, etc.
>
> Here is my philosophical position here: a repo can take advantage of the
> sparse-checkout feature if it is properly componentized. Those component
> boundaries are likely at folder boundaries. Any file that exists in a parent
> folder for two components is likely important to _both_ components. If
> a file is large and is not needed by both components, it should be placed
> deeper in the tree, so it can be avoided.
>
> With that philosophy in mind, I designed this to help users fall into the
> "pit of success" when their repo is in a good shape AND to motivate users
> with repos in non-optimal shapes to reorganize.
>
> The thought I had about exact file names is similar: if there is a large
> list of files in a folder where I only need a subset, then how do I know
> if a new file is added that I need? It will not show up in the directory
> without updating the sparse-checkout. A user would discover this need by
> something going wrong when they are not interacting with version control:
> a build.
>
> This is particularly important with the root directory. We need things
> like .gitignore, .gitattributes, README, LICENSE, etc. to be populated
> by default. If there are too many files at root to reasonably work with
> the repo, then the repo should be reorganized using folders.
>
> > Basically, our sparse usage is exclusively specifying leading
> > directories or full pathnames of individual files, but we really want
> > the repo to feel smaller and make sure people notice at a glance.  We
> > have a huge 'modules/' directory, and want people to be able to get
> > just 15 of the 500 or so subdirectories that would appear in that
> > directory with a non-sparse checkout.  And similarly we want to be
> > able to grab just one or two files from a directory of many files.
>
> Your modules/ example seems to work with the feature as designed, as
> you want a set of folders one level deeper. Grabbing one or two files
> from a directory is a direction we can go with the feature, but I will
> continue to believe that should be a rare occurrence compared to including
> a folder recursively.

Oh, you're right, I was misunderstanding what it'd do.  This does look
like it's really close to what we're using, and most of the
differences are probably worth some slight reshuffling of paths in
the repo.  Now that I've played with it some, it seems really awesome.

Being able to grab one or two files from a directory without grabbing
an entire directory and its parents I think would probably still be
useful, but I do agree that it'd be a rare occurrence.

> >> Note that I have some basic warnings to try and check that the
> >> sparse-checkout file doesn't match what would be written by a cone-mode add.
> >> In such a case, Git writes a warning to stderr and continues with the old
> >> pattern matching algorithm. These checks are currently very barebones, and
> >> would need to be updated with more robust checks for things like regex
> >> characters in the middle of the pattern. As review moves forward (and if we
> >> don't change the data storage) then we could spend more time on this.
> >
> > Instead of trying to validate the sparse-checkout file every time,
> > perhaps we want to change core.sparseCheckout from a boolean to a
> > tri-state or something where it specifies how to parse the
> > sparse-checkout file?  Or maybe when special directive (some form of
> > comment-looking line) appears at the top of sparse-checkout then we
> > use the hashsets speedup while disallowing general regexes and
> > pathspecs other than leading directories and full pathnames?
>
> In this series, I turn `core.sparseCheckout` into a tri-state, and only
> try to validate the sparse-checkout when `core.sparseCheckout=cone`.
> This avoids spending time on the validation when someone is content using
> the existing feature.
>
> The _intent_ of using the sparse-checkout file and no extra data structure
> was to let other clients (or an older client) read the sparse-checkout data
> and result in the same working directory. One thing I realized after
> submitting is that the tri-state config variable will cause old clients
> to error on parsing the non-boolean value. Instead, in v2 I will introduce
> a new boolean config variable "core.sparseCheckoutCone" that will do the
> same thing as the current series when `core.sparseCheckout=cone` and will
> fix this compat scenario.

Once we are forced to use yet another config variable, we may as well
use yet another config file ($GIT_DIR/info/sparse-checkout-cone or
something; or maybe a less specific name with greater future
compatibility via some version marking in it).

One thing I noticed twice while using this series was that when I had
an existing sparse checkout it was easy to get into a weird state
where things were messed up, I think due to the fact that
"sparse-checkout init [--cone]" prefers to honor any pre-existing
$GIT_DIR/info/sparse-checkout file.  Once my config file was very much
not cone-compatible, and another time it was empty and caused
read-tree to error out with something like "there'd be nothing left!".
I manually twiddled with core.sparseCheckout and the sparse-checkout
file and 'git read-tree -mu HEAD' to get it fixed, but I'd rather
avoid others running into such problems.  Sorry I didn't take good
notes on it; I was just trying to get a good feel for this series.

> > I'll try to get some time to look over these patches in the next few days.
>
> I look forward to your feedback! I also have some feedback to respond to
> from my team [1], but I'm waiting to make sure the community likes the
> overall idea before jumping into code style and method organization
> details.

I think this idea is great; I'm a big fan right now.  I'm excited to
see how this will pan out.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-24  5:40     ` Elijah Newren
@ 2019-08-26 13:29       ` Derrick Stolee
  2019-08-26 18:16         ` Elijah Newren
  2019-09-02 17:55       ` Eric Sunshine
  1 sibling, 1 reply; 196+ messages in thread
From: Derrick Stolee @ 2019-08-26 13:29 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List, Junio C Hamano

On 8/24/2019 1:40 AM, Elijah Newren wrote:
> On Thu, Aug 22, 2019 at 6:10 AM Derrick Stolee <stolee@gmail.com> wrote:
>>
>> On 8/21/2019 5:52 PM, Elijah Newren wrote:
>>> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
>>> <gitgitgadget@gmail.com> wrote:
> 
> <snip>
>>>> Here are some more specific details:
>>>>
>>>>  * git sparse-checkout init enables core.sparseCheckout and populates the
>>>>    sparse-checkout file with patterns that match only the files at root.
>>>
>>> Does it enable core.sparseCheckout in the current worktree, or for all
>>> worktrees?  Do we require extensions.worktreeConfig to be set to true
>>> first?  If we don't require extensions.worktreeConfig to be set to
>>> true, and users add worktrees later, do they encounter negative
>>> surprises (immediately or later)?
>>
>> This is an interesting scenario that I had not considered. Thanks!
>>
>> My guess is that we should set `extensions.worktreeConfig=true` to
>> avoid surprises. I'll need to play with this to discover the answers
>> to these questions:
>>
>> 1. Where does the worktree look for the sparse-checkout file? Does
>>    each worktree have its own sparse-checkout file? Should it?
> 
> For the main/first/primary worktree: .git/info/sparse-checkout
> For all other worktrees: .git/worktrees/$WORKTREE/info/sparse-checkout
> 
> So, yes, each has its own, and from my viewpoint, absolutely yes that
> is what we want.

Thanks for the info! I will definitely consider this in the next version,
and include a test to verify the interaction.

>> 2. If I have `extensions.worktreeConfig=true` and `core.sparseCheckout=true`
>>    in the current worktree and run `git worktree add`, does the new worktree
>>    have `core.sparseCheckout=true`? Can we `git clone --sparse` and then
>>    start building sparse worktrees seamlessly?
> 
> My $0.02: I think `git worktree add` should not only adopt the setting
> of core.sparseCheckout from the current worktree, but it should also
> adopt the $GIT_DIR/info/sparse-checkout file too.  Granted, users can
> change it to something else, but much like a new shell starts up with
> the same current working directory as its parent shell, I think it'd
> be most obvious for people to have a worktree that looked similar to
> the one they launched it from.

This seems natural to me: I'm adding a new worktree and expect the
settings to match my current worktree. If we later want to extend
`git worktree add` to include an `--empty-cone` option (that creates
the worktree as if it was created by `git clone --sparse-cone`) we
could do that independently.

> <snip>
>>> The default of reading from stdin seems a bit unusual to me, and I
>>> worry about having to explain that to users.  I'd rather the add
>>> command took positional parameters (anything that doesn't start with a
>>> hyphen) and added those, e.g.
>>>   $ git sparse-checkout add '/myFolder/*' '
>>> with the option of the user specifying --stdin.
>>
>> I had the same thought, and likely that's where we should go with the
>> builtin. For our needs, the input over stdin is more important for
>> testing, so I built it first. I will adjust the CLI here to take a set
>> of paths over the arguments unless --stdin is given.
>>
>>>>  * git sparse-checkout disable removes the patterns from the sparse-checkout
>>>>    file, disables core.sparseCheckout, and refills the working directory.
>>>
>>> Does it leave an empty sparse-checkout file around?  Also, what if
>>> users have several paths defining their sparse pattern, and want to
>>> temporarily get a full checkout and then come back -- do they need to
>>> re-specify all the paths?  (Maybe this *is* the route we want to go;
>>> I'm just trying to mention any possible negative effects we _might_
>>> run into so we can consider them.  It's not quite as relevant in my
>>> case since people specify a few toplevel modules and sparse-checkout
>>> gets several entries auto-generated for them.)
>>
>> In this case, there is an intermediate step (that follows the existing
>> advice) to modify the sparse-checkout file to contain only "/*\n" then
>> run read-tree to fill the working directory before disabling the config
>> setting.
>>
>> Perhaps "disable" is the wrong word to use, as it makes you think that
>> there should be an "enable" that can quickly toggle between the two
>> modes. Maybe instead it should be "git sparse-checkout reset [empty|full]"
>> where you could 'reset' the sparse-checkout to one of two initial
>> states:
>>
>> 1. empty: only files at root are included.
>> 2. full: all files are included.
>>
>> In each case, we would obliterate the existing sparse-checkout entries,
>> but hopefully that behavior is more clear from the command names.
> 
> Will "reset" be seen as slightly less obvious wording that needs to be
> explained to users?  If so, maybe have "undo" and "empty" verbs?  (Of
> course, "init" already empties, both when starting from full or when
> we have some kind of sparse checkout.)  I dunno, just some ideas.

Thanks for pointing out that my word choice could be improved. I'll
consider several options.

>>>> In cone mode, a user specifies a list of folders which the user wants every
>>>> file inside. In addition, the cone adds all blobs that are siblings of the
>>>> folders in the directory path to that folder. This makes the directories
>>>> look "hydrated" as a user drills down to those recursively-closed folders.
>>>> These directories are called "parent" folders, as a file matches them only
>>>> if the file's immediate parent is that directory.
>>>>
>>>> When building a prototype of this feature, I used a separate file to contain
>>>> the list of recursively-closed folders and built the hashsets dynamically
>>>> based on that file. In this implementation, I tried to maximize the amount
>>>> of backwards-compatibility by storing all data in the sparse-checkout file
>>>> using patterns recognized by earlier Git versions.
>>>>
>>>> For example, if we add A/B/C as a recursive folder, then we add the
>>>> following patterns to the sparse-checkout file:
>>>>
>>>> /*
>>>> !/*/*
>>>> /A/*
>>>> !/A/*/*
>>>> /A/B/*
>>>> !/A/B/*/*
>>>> /A/B/C/*
>>>>
>>>> The alternating positive/negative patterns say "include everything in this
>>>> folder, but exclude everything another level deeper". The final pattern has
>>>> no matching negation, so is a recursively closed pattern.
>>>
>>> Oh, um, would there be any option for fast but without grabbing
>>> sibling and parent files of requested directories?  And could users
>>> still request individual files (not with regex or pathspec, but fully
>>> specifying the path) and still get the fast mode?
>>
>> Exact files could probably be included and still be fast. It requires an
>> extra hash check per entry, but that's a small price to pay I think.
>>
>> With the sibling files, this is something I believe to be user-friendly:
>> as a user drills down into the folder they included recursively, there may
>> be helpful files along the way, like documentation, project files, etc.
>>
>> Here is my philosophical position here: a repo can take advantage of the
>> sparse-checkout feature if it is properly componentized. Those component
>> boundaries are likely at folder boundaries. Any file that exists in a parent
>> folder for two components is likely important to _both_ components. If
>> a file is large and is not needed by both components, it should be placed
>> deeper in the tree, so it can be avoided.
>>
>> With that philosophy in mind, I designed this to help users fall into the
>> "pit of success" when their repo is in a good shape AND to motivate users
>> with repos in non-optimal shapes to reorganize.
>>
>> The thought I had about exact file names is similar: if there is a large
>> list of files in a folder where I only need a subset, then how do I know
>> if a new file is added that I need? It will not show up in the directory
>> without updating the sparse-checkout. A user would discover this need by
>> something going wrong when they are not interacting with version control:
>> a build.
>>
>> This is particularly important with the root directory. We need things
>> like .gitignore, .gitattributes, README, LICENSE, etc. to be populated
>> by default. If there are too many files at root to reasonably work with
>> the repo, then the repo should be reorganized using folders.
>>
>>> Basically, our sparse usage is exclusively specifying leading
>>> directories or full pathnames of individual files, but we really want
>>> the repo to feel smaller and make sure people notice at a glance.  We
>>> have a huge 'modules/' directory, and want people to be able to get
>>> just 15 of the 500 or so subdirectories that would appear in that
>>> directory with a non-sparse checkout.  And similarly we want to be
>>> able to grab just one or two files from a directory of many files.
>>
>> Your modules/ example seems to work with the feature as designed, as
>> you want a set of folders one level deeper. Grabbing one or two files
>> from a directory is a direction we can go with the feature, but I will
>> continue to believe that should be a rare occurrence compared to including
>> a folder recursively.
> 
> Oh, you're right, I was misunderstanding what it'd do.  This does look
> like it's really close to what we're using, and most of the
> differences are probably worth some slightly reshuffling of paths in
> the repo.  Now that I've played with it some, it seems really awesome.
> 
> Being able to grab one or two files from a directory without grabbing
> an entire directory and its parents I think would probably still be
> useful, but I do agree that it'd be a rare occurrence.

I think we can leave the file-by-file addition for later, but may need
to make certain design decisions in this initial version to avoid issues
with adding that feature in the future. (Perhaps the recursive-folder input
should have "/" at the end, to clearly state these are folders, not files.)

>>>> Note that I have some basic warnings to try and check that the
>>>> sparse-checkout file doesn't match what would be written by a cone-mode add.
>>>> In such a case, Git writes a warning to stderr and continues with the old
>>>> pattern matching algorithm. These checks are currently very barebones, and
>>>> would need to be updated with more robust checks for things like regex
>>>> characters in the middle of the pattern. As review moves forward (and if we
>>>> don't change the data storage) then we could spend more time on this.
>>>
>>> Instead of trying to validate the sparse-checkout file every time,
>>> perhaps we want to change core.sparseCheckout from a boolean to a
>>> tri-state or something where it specifies how to parse the
>>> sparse-checkout file?  Or maybe when special directive (some form of
>>> comment-looking line) appears at the top of sparse-checkout then we
>>> use the hashsets speedup while disallowing general regexes and
>>> pathspecs other than leading directories and full pathnames?
>>
>> In this series, I turn `core.sparseCheckout` into a tri-state, and only
>> try to validate the sparse-checkout when `core.sparseCheckout=cone`.
>> This avoids spending time on the validation when someone is content using
>> the existing feature.
>>
>> The _intent_ of using the sparse-checkout file and no extra data structure
>> was to let other clients (or an older client) read the sparse-checkout data
>> and result in the same working directory. One thing I realized after
>> submitting is that the tri-state config variable will cause old clients
>> to error on parsing the non-boolean value. Instead, in v2 I will introduce
>> a new boolean config variable "core.sparseCheckoutCone" that will do the
>> same thing as the current series when `core.sparseCheckout=cone` and will
>> fix this compat scenario.
> 
> Once we are forced to use yet another config variable, we may as well
> use yet another config file ($GITDIR/info/sparse-checkout-cone or
> something; or maybe a less specific name with greater future
> compatibility via some version marking in it).

I'm hesitant to include a second "source of truth," as that can cause
issues when users modify the sparse-checkout file directly. Since the
existing way to interact with the sparse-checkout is to directly edit
the file, I want to be as careful as possible around users who modify
that themselves. The caveat is that if they specify "cone" mode then
they will get warnings and worse performance if they modify it outside
the limited patterns we allow.

> One thing I noticed twice while using this series was that when I had
> an existing sparse checkout it was easy to get into a weird state
> where things were messed up, I think due to the fact that
> "sparse-checkout init [--cone]" prefers to honor any pre-existing
> $GITDIR/info/sparse-checkout file.  Once my config file was very much
> not cone-compatible, and another time it was empty and caused
> read-tree to error out with something like "there'd be nothing left!".
> I manually twiddled with core.sparseCheckout and the sparse-checkout
> file and 'git read-tree -mu HEAD' to get it fixed, but I'd rather
> avoid others running into such problems.  Sorry I didn't take good
> notes on it; I was just trying to get a good feel for this series.

Thanks for this interesting use case! I think an empty file should be
updated with the root files, since Git does not think that is a valid
state. The current series must only check for existence, not content.
 
>>> I'll try to get some time to look over these patches in the next few days.
>>
>> I look forward to your feedback! I also have some feedback to respond to
>> from my team [1], but I'm waiting to make sure the community likes the
>> overall idea before jumping into code style and method organization
>> details.
> 
> I think this idea is great; I'm a big fan right now.  I'm excited to
> see how this will pan out.

Thanks! I'll be taking a close look at your patch-by-patch feedback
this week and hope to have a non-RFC v2 soon.

-Stolee


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-26 13:29       ` Derrick Stolee
@ 2019-08-26 18:16         ` Elijah Newren
  2019-08-26 19:16           ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-08-26 18:16 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List, Junio C Hamano

On Mon, Aug 26, 2019 at 6:29 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 8/24/2019 1:40 AM, Elijah Newren wrote:
> > On Thu, Aug 22, 2019 at 6:10 AM Derrick Stolee <stolee@gmail.com> wrote:
> >>
> >> On 8/21/2019 5:52 PM, Elijah Newren wrote:
> >>> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
> >>> <gitgitgadget@gmail.com> wrote:
> >

> >> In this series, I turn `core.sparseCheckout` into a tri-state, and only
> >> try to validate the sparse-checkout when `core.sparseCheckout=cone`.
> >> This avoids spending time on the validation when someone is content using
> >> the existing feature.
> >>
> >> The _intent_ of using the sparse-checkout file and no extra data structure
> >> was to let other clients (or an older client) read the sparse-checkout data
> >> and result in the same working directory. One thing I realized after
> >> submitting is that the tri-state config variable will cause old clients
> >> to error on parsing the non-boolean value. Instead, in v2 I will introduce
> >> a new boolean config variable "core.sparseCheckoutCone" that will do the
> >> same thing as the current series when `core.sparseCheckout=cone` and will
> >> fix this compat scenario.
> >
> > Once we are forced to use yet another config variable, we may as well
> > use yet another config file ($GITDIR/info/sparse-checkout-cone or
> > something; or maybe a less specific name with greater future
> > compatibility via some version marking in it).
>
> I'm hesitant to include a second "source of truth," as that can cause
> issues when users modify the sparse-checkout file directly. Since the
> existing way to interact with the sparse-checkout is to directly edit
> the file, I want to be as careful as possible around users who modify
> that themselves. The caveat is that if they specify "cone" mode then
> they will get warnings and worse performance if they modify it outside
> the limited patterns we allow.

Wait...does that mean you allow mixing and matching both regular style
sparse-checkout declarations with cone-mode style declarations within
the same file?  Are the non-cone mode entries ignored?  Does it
fall back to non-cone mode for all entries?  Or does that mean you
allow checking out both old and new styles of filesets, where you
optimize the cone-mode style declarations with your hashsets, and have
the remaining ones fall back to the old O(N*M) matching?  (I think it
does the last of those, right?)

If you support both, it sounds like you naturally support doing cone
mode with allowing people to also grab a handful of additional files
without the rest of their directories or parents.  It's just that
folks who want to do that will ask for a way to turn off any warnings
you spew, and if you turn the warnings off, then people who meant to
get cone behavior but mistyped stuff might complain about no warnings.
Hmm....

(Just trying to think things through out loud; I don't necessarily
know what's good or bad here, just voicing what I think might happen.)

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-26 18:16         ` Elijah Newren
@ 2019-08-26 19:16           ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-08-26 19:16 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List, Junio C Hamano

On 8/26/2019 2:16 PM, Elijah Newren wrote:
> On Mon, Aug 26, 2019 at 6:29 AM Derrick Stolee <stolee@gmail.com> wrote:
>>
>> On 8/24/2019 1:40 AM, Elijah Newren wrote:
>>> On Thu, Aug 22, 2019 at 6:10 AM Derrick Stolee <stolee@gmail.com> wrote:
>>>>
>>>> On 8/21/2019 5:52 PM, Elijah Newren wrote:
>>>>> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
>>>>> <gitgitgadget@gmail.com> wrote:
>>>
> 
>>>> In this series, I turn `core.sparseCheckout` into a tri-state, and only
>>>> try to validate the sparse-checkout when `core.sparseCheckout=cone`.
>>>> This avoids spending time on the validation when someone is content using
>>>> the existing feature.
>>>>
>>>> The _intent_ of using the sparse-checkout file and no extra data structure
>>>> was to let other clients (or an older client) read the sparse-checkout data
>>>> and result in the same working directory. One thing I realized after
>>>> submitting is that the tri-state config variable will cause old clients
>>>> to error on parsing the non-boolean value. Instead, in v2 I will introduce
>>>> a new boolean config variable "core.sparseCheckoutCone" that will do the
>>>> same thing as the current series when `core.sparseCheckout=cone` and will
>>>> fix this compat scenario.
>>>
>>> Once we are forced to use yet another config variable, we may as well
>>> use yet another config file ($GITDIR/info/sparse-checkout-cone or
>>> something; or maybe a less specific name with greater future
>>> compatibility via some version marking in it).
>>
>> I'm hesitant to include a second "source of truth," as that can cause
>> issues when users modify the sparse-checkout file directly. Since the
>> existing way to interact with the sparse-checkout is to directly edit
>> the file, I want to be as careful as possible around users who modify
>> that themselves. The caveat is that if they specify "cone" mode then
>> they will get warnings and worse performance if they modify it outside
>> the limited patterns we allow.
> 
> Wait...does that mean you allow mixing and matching both regular style
> sparse-checkout declarations with cone-mode style declarations within
> the same file?  Are the non-cone mode entries ignored?  Does it
> fall back to non-cone mode for all entries?  Or does that mean you
> allow checking out both old and new styles of filesets, where you
> optimize the cone-mode style declarations with your hashsets, and have
> the remaining ones fall back to the old O(N*M) matching?  (I think it
> does the last of those, right?)
> 
> If you support both, it sounds like you naturally support doing cone
> mode with allowing people to also grab a handful of additional files
> without the rest of their directories or parents.  It's just that
> folks who want to do that will ask for a way to turn off any warnings
> you spew, and if you turn the warnings off, then people who meant to
> get cone behavior but mistyped stuff might complain about no warnings.
> Hmm....
> 
> (Just trying to think things through out loud; I don't necessarily
> know what's good or bad here, just voicing what I think might happen.)

The way I built the current series is that we honor what is in the
sparse-checkout as historically allowed. Always.

If a user modifies the sparse-checkout to have patterns that don't match
those that are added in cone mode, then Git warns the user this is the
case then reverts to the old pattern-by-pattern logic. This is to have
the Git client always match what another Git client would expect. (This
could be JGit or an older version of Git.) A user could always disable
cone mode to remove the warning and keep their sparse-checkout in its
current state.

Note: I have not made the "non-cone-mode pattern" checks very robust.
For instance, I don't check the middle characters for wildcards. This
needs to happen at write time, too. The plan is to make these more
robust in future versions of the series.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode
  2019-08-24  5:40     ` Elijah Newren
  2019-08-26 13:29       ` Derrick Stolee
@ 2019-09-02 17:55       ` Eric Sunshine
  1 sibling, 0 replies; 196+ messages in thread
From: Eric Sunshine @ 2019-09-02 17:55 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Derrick Stolee, Derrick Stolee via GitGitGadget,
	Git Mailing List, Junio C Hamano

On Sat, Aug 24, 2019 at 1:40 AM Elijah Newren <newren@gmail.com> wrote:
> My $0.02: I think `git worktree add` should not only adopt the setting
> of core.sparseCheckout from the current worktree, but it should also
> adopt the $GIT_DIR/info/sparse-checkout file too.

As another example in favor of imbuing "git worktree add" with
first-class support for this feature (via command-line option and/or
inheriting existing settings), the commit message of ef2a0ac9a0
(worktree: add: introduce --checkout option, 2016-03-29) specifically
cites sparse checkout as the motivation for adding --no-checkout to
"git worktree add".

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 2/9] sparse-checkout: create 'init' subcommand
  2019-08-23 23:02   ` Elijah Newren
@ 2019-09-11 14:27     ` Derrick Stolee
  2019-09-11 20:28     ` Derrick Stolee
  1 sibling, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-09-11 14:27 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 8/23/2019 7:02 PM, Elijah Newren wrote:
> On Tue, Aug 20, 2019 at 8:13 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> Getting started with a sparse-checkout file can be daunting. Help
>> users start their sparse enlistment using 'git sparse-checkout init'.
>> This will set 'core.sparseCheckout=true' in their config, write
>> an initial set of patterns to the sparse-checkout file, and update
>> their working directory.
>>
>> Using 'git read-tree' to clear directories does not work cleanly
>> on Windows, so manually delete directories that are tracked by Git
>> before running read-tree.
> 
> Is that a bug in read-tree that needs to be fixed?

Just to follow up on this: it turns out that this is NOT a bug in
read-tree, but rather a side-effect of our custom "core.gvfs" config
setting. In the virtualized world, we didn't want Git to hard-delete
a folder just because we marked everything sparse.

By removing that option from my environment, the deletions work as
expected.

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 2/9] sparse-checkout: create 'init' subcommand
  2019-08-23 23:02   ` Elijah Newren
  2019-09-11 14:27     ` Derrick Stolee
@ 2019-09-11 20:28     ` Derrick Stolee
  1 sibling, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-09-11 20:28 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 8/23/2019 7:02 PM, Elijah Newren wrote:
> On Tue, Aug 20, 2019 at 8:13 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>> +static int sc_read_tree(void)
>> +{
>> +       struct argv_array argv = ARGV_ARRAY_INIT;
>> +       int result = 0;
>> +       argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
>> +
>> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>> +               error(_("failed to update index with new sparse-checkout paths"));
>> +               result = 1;
>> +       }
> 
> `git read-tree -m -u HEAD` will fail if the index has any higher stage
> entries in it, even if those higher stage entries correspond to files
> which are included in the sparseness patterns and thus would not need
> an update.  It might be nice if we can find a way to provide a better
> error message, and/or implement the read-tree -m -u HEAD internally in
> a way that will allow us to not fail if the conflicted files are
> included in the sparse set.

I agree that this is not the _best_ thing to do, but it does mimic the
current recommendation for a user interacting with sparse-checkout.

I'll rename this helper to something like "update_working_directory()"
so it can be swapped with a different implementation later, after we
work out those usability kinks.

The other thing that is needed here: allow reverting the sparse-checkout
settings if this fails. I'll isolate that to a new commit so we can
examine that behavior carefully.

> 
>> +
>> +       argv_array_clear(&argv);
>> +       return result;
>> +}
>> +
>> +static int sc_enable_config(void)
>> +{
>> +       struct argv_array argv = ARGV_ARRAY_INIT;
>> +       int result = 0;
>> +       argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", "true", NULL);
> 
> Why --add?  That seems really odd to me.

Yeah, that's a mistake. Good find.

> 
> This should also have "--worktree".  And this function should either
> set extensions.worktreeConfig to true or die if it isn't already set;
> not sure which.  There's some UI and documentation stuff to figure out
> here...

I was planning to switch my `git config` subcommand to use in-process
methods, but I'm struggling to find a way to ensure we follow the
`--worktree` option. It likely would work if extensions.worktreeConfig
was enabled when the process starts, but adding it in-process likely
causes a problem.

> 
>> +
>> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>> +               error(_("failed to enable core.sparseCheckout"));
>> +               result = 1;
>> +       }
>> +
>> +       argv_array_clear(&argv);
>> +       return result;
>> +}
>> +
>> +static int delete_directory(const struct object_id *oid, struct strbuf *base,
>> +               const char *pathname, unsigned mode, int stage, void *context)
>> +{
>> +       struct strbuf dirname = STRBUF_INIT;
>> +       struct stat sb;
>> +
>> +       strbuf_addstr(&dirname, the_repository->worktree);
>> +       strbuf_addch(&dirname, '/');
>> +       strbuf_addstr(&dirname, pathname);
>> +
>> +       if (stat(dirname.buf, &sb) || !(sb.st_mode & S_IFDIR))
>> +               return 0;
>> +
>> +       if (remove_dir_recursively(&dirname, 0))
> 
> flags = 0 implies not REMOVE_DIR_EMPTY_ONLY.  I'm not familiar with
> remove_dir_recursively(), but won't this delete everything...including
> untracked files?  If so, that sounds like a bug.
This whole thing isn't needed any more, since read-tree does the right
thing.

> 
>> +               warning(_("failed to remove directory '%s'"),
>> +                       dirname.buf);
>> +
>> +       strbuf_release(&dirname);
>> +       return 0;
>> +}
>> +
>> +static int sparse_checkout_init(int argc, const char **argv)
>> +{
>> +       struct tree *t;
>> +       struct object_id oid;
>> +       struct exclude_list el;
>> +       static struct pathspec pathspec;
>> +       char *sparse_filename;
>> +       FILE *fp;
>> +       int res;
>> +
>> +       if (sc_enable_config())
>> +               return 1;
>> +
>> +       memset(&el, 0, sizeof(el));
>> +
>> +       sparse_filename = get_sparse_checkout_filename();
>> +       res = add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
> 
> But 'el' isn't used again?  Why are we getting the list of files from
> sparse_filename then?

This is the only way I could think of to check that the sparse-checkout
file parses well without just doing the file open myself. Maybe we only
need to check if the file exists (and is not empty).

>> +
>> +       /* If we already have a sparse-checkout file, use it. */
>> +       if (res >= 0) {
>> +               free(sparse_filename);
>> +               goto reset_dir;
>> +       }
>> +
>> +       /* initial mode: all blobs at root */
>> +       fp = fopen(sparse_filename, "w");
>> +       free(sparse_filename);
>> +       fprintf(fp, "/*\n!/*/*\n");
>> +       fclose(fp);
> 
> Makes sense.
> 
>> +
>> +       /* remove all directories in the root, if tracked by Git */
>> +       if (get_oid("HEAD", &oid)) {
>> +               /* assume we are in a fresh repo */
>> +               return 0;
>> +       }
>> +
>> +       t = parse_tree_indirect(&oid);
>> +
>> +       parse_pathspec(&pathspec, PATHSPEC_ALL_MAGIC &
>> +                                 ~(PATHSPEC_FROMTOP | PATHSPEC_LITERAL),
>> +                      PATHSPEC_PREFER_CWD,
>> +                      "", NULL);
>> +
>> +       if (read_tree_recursive(the_repository, t, "", 0, 0, &pathspec,
>> +                               delete_directory, NULL))
>> +               return 1;
> 
> Since this is only needed on Windows, as per your commit message,
> should it be #ifdef'd?  Or is this actually a bug that should be fixed
> in "git read-tree -mu HEAD"?

(this will not be needed, but thanks!)
 
>> +
>> +reset_dir:
>> +       return sc_read_tree();
>> +}
>> +
> 
> The rest looks fine.

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 3/9] clone: add --sparse mode
  2019-08-23 23:17   ` Elijah Newren
@ 2019-09-18 13:51     ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-09-18 13:51 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 8/23/2019 7:17 PM, Elijah Newren wrote:
> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> When someone wants to clone a large repository, but plans to work
>> using a sparse-checkout file, they either need to do a full
>> checkout first and then reduce the patterns they included, or
>> clone with --no-checkout, set up their patterns, and then run
>> a checkout manually. This requires knowing a lot about the repo
>> shape and how sparse-checkout works.
>>
>> Add a new '--sparse' option to 'git clone' that initializes the
>> sparse-checkout file to include the following patterns:
>>
>>         /*
>>         !/*/*
>>
>> These patterns include every file in the root directory, but
>> no directories. This allows a repo to include files like a
>> README or a bootstrapping script to grow enlistments from that
>> point.
> 
> Nice.
> 
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  Documentation/git-clone.txt        |  8 +++++++-
>>  builtin/clone.c                    | 27 +++++++++++++++++++++++++++
>>  t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
>>  3 files changed, 47 insertions(+), 1 deletion(-)
>>
>> diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
>> index 34011c2940..0fe91d2f04 100644
>> --- a/Documentation/git-clone.txt
>> +++ b/Documentation/git-clone.txt
>> @@ -15,7 +15,7 @@ SYNOPSIS
>>           [--dissociate] [--separate-git-dir <git dir>]
>>           [--depth <depth>] [--[no-]single-branch] [--no-tags]
>>           [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
>> -         [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
>> +         [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
>>           [<directory>]
>>
>>  DESCRIPTION
>> @@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
>>         used, neither remote-tracking branches nor the related
>>         configuration variables are created.
>>
>> +--sparse::
>> +       Initialize the sparse-checkout file so the working
>> +       directory starts with only the files in the root
>> +       of the repository. The sparse-checkout file can be
>> +       modified to grow the working directory as needed.
>> +
>>  --mirror::
>>         Set up a mirror of the source repository.  This implies `--bare`.
>>         Compared to `--bare`, `--mirror` not only maps local branches of the
>> diff --git a/builtin/clone.c b/builtin/clone.c
>> index f665b28ccc..d6d49a73ff 100644
>> --- a/builtin/clone.c
>> +++ b/builtin/clone.c
>> @@ -60,6 +60,7 @@ static const char *real_git_dir;
>>  static char *option_upload_pack = "git-upload-pack";
>>  static int option_verbosity;
>>  static int option_progress = -1;
>> +static int option_sparse_checkout;
>>  static enum transport_family family;
>>  static struct string_list option_config = STRING_LIST_INIT_NODUP;
>>  static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
>> @@ -147,6 +148,8 @@ static struct option builtin_clone_options[] = {
>>         OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
>>         OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
>>                     N_("any cloned submodules will use their remote-tracking branch")),
>> +       OPT_BOOL(0, "sparse", &option_sparse_checkout,
>> +                   N_("initialize sparse-checkout file to include only files at root")),
>>         OPT_END()
>>  };
>>
>> @@ -734,6 +737,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
>>         }
>>  }
>>
>> +static int git_sparse_checkout_init(const char *repo)
>> +{
>> +       struct argv_array argv = ARGV_ARRAY_INIT;
>> +       int result = 0;
>> +       argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
>> +
>> +       /*
>> +        * We must apply the setting in the current process
>> +        * for the later checkout to use the sparse-checkout file.
>> +        */
>> +       core_apply_sparse_checkout = 1;
>> +
>> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>> +               error(_("failed to initialize sparse-checkout"));
>> +               result = 1;
>> +       }
> 
> Sigh...so much forking of additional processes.  I'd really rather
> that we were reducing how much of this we are doing in the codebase
> instead of adding more.  Every fork makes following stuff in a
> debugger harder.

At the moment, this is the simplest way to do this interaction. The
init subcommand is doing multiple things, and we can consider moving
this to be a library method instead of builtin-specific code later.

This is not a huge performance hit, as "clone" is called only once
per repo.

>> +
>> +       argv_array_clear(&argv);
>> +       return result;
>> +}
>> +
>>  static int checkout(int submodule_progress)
>>  {
>>         struct object_id oid;
>> @@ -1107,6 +1131,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
>>         if (option_required_reference.nr || option_optional_reference.nr)
>>                 setup_reference();
>>
>> +       if (option_sparse_checkout && git_sparse_checkout_init(repo))
>> +               return 1;
>> +
>>         remote = remote_get(option_origin);
>>
>>         strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
>> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
>> index 35ab84aabd..b7d5f15830 100755
>> --- a/t/t1091-sparse-checkout-builtin.sh
>> +++ b/t/t1091-sparse-checkout-builtin.sh
>> @@ -87,4 +87,17 @@ test_expect_success 'init with existing sparse-checkout' '
>>         test_cmp expect dir
>>  '
>>
>> +test_expect_success 'clone --sparse' '
>> +       git clone --sparse repo clone &&
>> +       git -C clone sparse-checkout list >actual &&
>> +       cat >expect <<-EOF &&
>> +               /*
>> +               !/*/*
>> +       EOF
>> +       test_cmp expect actual &&
>> +       ls clone >dir &&
>> +       echo a >expect &&
>> +       test_cmp expect dir
> 
> Checking that a toplevel entry is present, but not checking that an
> entry from a subdir is missing as expected?

This test is checking that the file "a" is the _only_ entry in the root
of the repo. The directories "folder1" and "folder2" are not present, since
we are comparing the ls output to "expect".

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 4/9] sparse-checkout: 'add' subcommand
  2019-08-23 23:30   ` Elijah Newren
@ 2019-09-18 13:55     ` Derrick Stolee
  2019-09-18 14:56       ` Elijah Newren
  0 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee @ 2019-09-18 13:55 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 8/23/2019 7:30 PM, Elijah Newren wrote:
> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The 'git sparse-checkout add' subcommand takes a list of patterns
>> over stdin and writes them to the sparse-checkout file. Then, it
>> updates the working directory using 'git read-tree -mu HEAD'.
> 
> As mentioned in response to the cover letter, I'd rather see it take
> patterns as positional arguments (though requiring a '--' argument
> before any patterns that start with a hyphen).  It could also take
> --stdin to read from stdin.
> 
>> Note: if a user adds a negative pattern that would lead to the
>> removal of a non-empty directory, then Git may not delete that
>> directory (on Windows).
> 
> This sounds like you're re-iterating a bug mentioned earlier, but if
> someone in the future comes and reads this comment it might sound like
> you're saying git can avoid clearing a directory for optimization or
> other reasons.  (And, of course, it'd be nice to figure out why this
> bug exists.)
> 
> Another question this brings up, though, is that you worked around
> this bug in 'init' so why would you not also do so for 'add'?  Seems
> inconsistent to me.
> 
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  Documentation/git-sparse-checkout.txt |  4 ++++
>>  builtin/sparse-checkout.c             | 32 ++++++++++++++++++++++++++-
>>  t/t1091-sparse-checkout-builtin.sh    | 20 +++++++++++++++++
>>  3 files changed, 55 insertions(+), 1 deletion(-)
>>
>> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
>> index 50c53ee60a..6f540a3443 100644
>> --- a/Documentation/git-sparse-checkout.txt
>> +++ b/Documentation/git-sparse-checkout.txt
>> @@ -34,6 +34,10 @@ COMMANDS
>>         by Git. Add patterns to the sparse-checkout file to
>>         repopulate the working directory.
>>
>> +'add'::
>> +       Add a set of patterns to the sparse-checkout file, as given over
>> +       stdin. Updates the working directory to match the new patterns.
>> +
>>  SPARSE CHECKOUT
>>  ----------------
>>
>> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
>> index 86d24e6295..ec6134fecc 100644
>> --- a/builtin/sparse-checkout.c
>> +++ b/builtin/sparse-checkout.c
>> @@ -8,7 +8,7 @@
>>  #include "strbuf.h"
>>
>>  static char const * const builtin_sparse_checkout_usage[] = {
>> -       N_("git sparse-checkout [init|list]"),
>> +       N_("git sparse-checkout [init|add|list]"),
>>         NULL
>>  };
>>
>> @@ -166,6 +166,34 @@ static int sparse_checkout_init(int argc, const char **argv)
>>         return sc_read_tree();
>>  }
>>
>> +static int sparse_checkout_add(int argc, const char **argv)
>> +{
>> +       struct exclude_list el;
>> +       char *sparse_filename;
>> +       FILE *fp;
>> +       struct strbuf line = STRBUF_INIT;
>> +
>> +       memset(&el, 0, sizeof(el));
>> +
>> +       sparse_filename = get_sparse_checkout_filename();
>> +       add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
> 
> el is an exclude_list and we call add_excludes_..., but it's actually
> an *include* list.  This is going to cause errors at some point, and
> will cause lots of headaches.
> 
>> +
>> +       fp = fopen(sparse_filename, "w");
>> +       write_excludes_to_file(fp, &el);
>> +
>> +       while (!strbuf_getline(&line, stdin)) {
>> +               strbuf_trim(&line);
>> +               fprintf(fp, "%s\n", line.buf);
>> +       }
> 
> Should we first check whether these excludes are already in the
> sparse-checkout file?
> 
>> +       fclose(fp);
>> +       free(sparse_filename);
>> +
>> +       clear_exclude_list(&el);
>> +
>> +       return sc_read_tree();
> 
> What if someone calls 'git sparse-checkout add' without first calling
> 'git sparse-checkout init'?  As far as I can tell, core.sparseCheckout
> will be unset (i.e. treated as false), meaning that this operation
> will do some work, but result in no changes and a report of success.
> After users try to figure out why it won't work, they eventually run
> 'git sparse-checkout init', which will delete all the entries they
> previously added with the 'add' subcommand.
> 
> What should happen instead?

If someone runs 'git sparse-checkout init' after an 'add', the
sparse-checkout file has contents, so the init does not overwrite
those.

(In the update I'm working on, 'init' doesn't delete folders, so
it will only remove the files that do not match the patterns.)

>> +}
>> +
>>  int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>>  {
>>         static struct option builtin_sparse_checkout_options[] = {
>> @@ -187,6 +215,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>>                         return sparse_checkout_list(argc, argv);
>>                 if (!strcmp(argv[0], "init"))
>>                         return sparse_checkout_init(argc, argv);
>> +               if (!strcmp(argv[0], "add"))
>> +                       return sparse_checkout_add(argc, argv);
>>         }
>>
>>         usage_with_options(builtin_sparse_checkout_usage,
>> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
>> index b7d5f15830..499bd8d6d0 100755
>> --- a/t/t1091-sparse-checkout-builtin.sh
>> +++ b/t/t1091-sparse-checkout-builtin.sh
>> @@ -100,4 +100,24 @@ test_expect_success 'clone --sparse' '
>>         test_cmp expect dir
>>  '
>>
>> +test_expect_success 'add to existing sparse-checkout' '
>> +       echo "/folder2/*" | git -C repo sparse-checkout add &&
> 
> I've always been using '/folder2/' in sparse-checkout, without the
> trailing asterisk.  That seems more friendly for cone mode too.  Are
> there benefits to keeping the trailing asterisk?

I think I've been seeing issues with pattern matching on Windows without
the trailing asterisk. I'm currently double-checking to make sure this
is important or not.
 
>> +       cat >expect <<-EOF &&
>> +               /*
>> +               !/*/*
>> +               /folder1/*
>> +               /folder2/*
>> +       EOF
>> +       git -C repo sparse-checkout list >actual &&
>> +       test_cmp expect actual &&
>> +       test_cmp expect repo/.git/info/sparse-checkout &&
>> +       ls repo >dir  &&
>> +       cat >expect <<-EOF &&
>> +               a
>> +               folder1
>> +               folder2
>> +       EOF
>> +       test_cmp expect dir
>> +'
>> +
>>  test_done
>> \ No newline at end of file

I'm trying to fix this newline issue everywhere.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 4/9] sparse-checkout: 'add' subcommand
  2019-09-18 13:55     ` Derrick Stolee
@ 2019-09-18 14:56       ` Elijah Newren
  2019-09-18 17:23         ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-09-18 14:56 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List,
	Junio C Hamano, Derrick Stolee

On Wed, Sep 18, 2019 at 6:55 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 8/23/2019 7:30 PM, Elijah Newren wrote:
> > On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
> > <gitgitgadget@gmail.com> wrote:
> >>
...
> >> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> >> index b7d5f15830..499bd8d6d0 100755
> >> --- a/t/t1091-sparse-checkout-builtin.sh
> >> +++ b/t/t1091-sparse-checkout-builtin.sh
> >> @@ -100,4 +100,24 @@ test_expect_success 'clone --sparse' '
> >>         test_cmp expect dir
> >>  '
> >>
> >> +test_expect_success 'add to existing sparse-checkout' '
> >> +       echo "/folder2/*" | git -C repo sparse-checkout add &&
> >
> > I've always been using '/folder2/' in sparse-checkout, without the
> > trailing asterisk.  That seems more friendly for cone mode too.  Are
> > there benefits to keeping the trailing asterisk?
>
> I think I've been seeing issues with pattern matching on Windows without
> the trailing asterisk. I'm currently double-checking to make sure this
> is important or not.

Can you try with the en/clean-nested-with-ignored topic in pu to see
if that fixes those issues?

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH 4/9] sparse-checkout: 'add' subcommand
  2019-09-18 14:56       ` Elijah Newren
@ 2019-09-18 17:23         ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-09-18 17:23 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List,
	Junio C Hamano, Derrick Stolee

On 9/18/2019 10:56 AM, Elijah Newren wrote:
> On Wed, Sep 18, 2019 at 6:55 AM Derrick Stolee <stolee@gmail.com> wrote:
>>
>> On 8/23/2019 7:30 PM, Elijah Newren wrote:
>>> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
>>> <gitgitgadget@gmail.com> wrote:
>>>>
> ...
>>>> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
>>>> index b7d5f15830..499bd8d6d0 100755
>>>> --- a/t/t1091-sparse-checkout-builtin.sh
>>>> +++ b/t/t1091-sparse-checkout-builtin.sh
>>>> @@ -100,4 +100,24 @@ test_expect_success 'clone --sparse' '
>>>>         test_cmp expect dir
>>>>  '
>>>>
>>>> +test_expect_success 'add to existing sparse-checkout' '
>>>> +       echo "/folder2/*" | git -C repo sparse-checkout add &&
>>>
>>> I've always been using '/folder2/' in sparse-checkout, without the
>>> trailing asterisk.  That seems more friendly for cone mode too.  Are
>>> there benefits to keeping the trailing asterisk?
>>
>> I think I've been seeing issues with pattern matching on Windows without
>> the trailing asterisk. I'm currently double-checking to make sure this
>> is important or not.
> 
> Can you try with the en/clean-nested-with-ignored topic in pu to see
> if that fixes those issues?

Merging with that branch was very difficult. There is a lot of unshared
history between our branches.

Instead, I tried once more to dig into the strange issue on Windows, and
it appears it is an issue with how the Git for Windows SDK modifies shell
arguments with a "/".

When I ran `git sparse-checkout set "/folder1/*"` it worked.

When I run `git sparse-checkout set "/folder1/"`, the SDK completes that
argument to "C:/git-sdk-64/folder1/" on my machine (something more
complicated on the build machine). It's not actually a bug in the Git
code, but something in the build and test environment.

I can get around it by testing the builtin without using these cone-like
patterns. When using `git sparse-checkout set folder1 folder2` in cone
mode, Git does the right thing.

Sorry for the noise here.

-Stolee


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode
  2019-08-20 15:11 [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Derrick Stolee via GitGitGadget
                   ` (9 preceding siblings ...)
  2019-08-21 21:52 ` [PATCH 0/9] [RFC] New sparse-checkout builtin and "cone" mode Elijah Newren
@ 2019-09-19 14:43 ` " Derrick Stolee via GitGitGadget
  2019-09-19 14:43   ` [PATCH v2 01/11] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
                     ` (12 more replies)
  10 siblings, 13 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano

This series makes the sparse-checkout feature more user-friendly. While
there, I also present a way to use a limited set of patterns to gain a
significant performance boost in very large repositories.

Sparse-checkout is only documented as a subsection of the read-tree docs
[1], which makes the feature hard to discover. Users have trouble navigating
the feature, especially at clone time [2], and have even resorted to
creating their own helper tools [3].

This series attempts to solve these problems using a new builtin. Here is a
sample workflow to give a feeling for how it can work:

In an existing repo:

$ git sparse-checkout init
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout set "/*" "!/*/" /myFolder/
$ ls
myFile1.txt myFile2.txt myFolder
$ ls myFolder
a.c a.h
$ git sparse-checkout disable
$ ls
hiddenFolder myFile1.txt myFile2.txt myFolder

At clone time:

$ git clone --sparse origin repo
$ cd repo
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout set "/*" "!/*/" /myFolder/
$ ls
myFile1.txt myFile2.txt myFolder

Here are some more specific details:

 * git sparse-checkout init enables core.sparseCheckout and populates the
   sparse-checkout file with patterns that match only the files at root.
   
   
 * git clone learns the --sparse argument to run git sparse-checkout init 
   before the first checkout.
   
   
 * git sparse-checkout set reads patterns from the arguments, or with
   --stdin reads patterns from stdin one per line, then writes them to the
   sparse-checkout file and refreshes the working directory.
   
   
 * git sparse-checkout disable removes the patterns from the sparse-checkout
   file, disables core.sparseCheckout, and refills the working directory.
   
   
 * git sparse-checkout list lists the contents of the sparse-checkout file.
   
   

The documentation for the sparse-checkout feature can now live primarily
with the git-sparse-checkout documentation.

Cone Mode
=========

What really got me interested in this area is a performance problem. If we
have N patterns in the sparse-checkout file and M entries in the index, then
we can perform up to O(N * M) pattern checks in clear_ce_flags(). This
quadratic growth is not sustainable in a repo with 1,000+ patterns and
1,000,000+ index entries.

To solve this problem, I propose a new, more restrictive mode to
sparse-checkout: "cone mode". In this mode, all patterns are based on prefix
matches at a directory level. This can then use hashsets for fast
performance -- O(M) instead of O(N*M). My hashset implementation is based on
the virtual filesystem hook in the VFS for Git custom code [4].

In cone mode, a user specifies a list of folders whose contents should be
included recursively. In addition, the cone adds all blobs that are siblings
of the folders along the directory path leading to each such folder. This
makes the directories look "hydrated" as a user drills down to those
recursively-closed folders. The directories along the path are called
"parent" folders, as a file matches them only if the file's immediate parent
is that directory.

When building a prototype of this feature, I used a separate file to contain
the list of recursively-closed folders and built the hashsets dynamically
based on that file. In this implementation, I tried to maximize the amount
of backwards-compatibility by storing all data in the sparse-checkout file
using patterns recognized by earlier Git versions.

For example, if we add A/B/C as a recursive folder, then we add the
following patterns to the sparse-checkout file:

/*
!/*/
/A/
!/A/*/
/A/B/
!/A/B/*/
/A/B/C/

The alternating positive/negative patterns say "include everything in this
folder, but exclude everything another level deeper". The final pattern has
no matching negation, so is a recursively closed pattern.

Note that I have some basic warnings to try and check that the
sparse-checkout file doesn't match what would be written by a cone-mode add.
In such a case, Git writes a warning to stderr and continues with the old
pattern matching algorithm. These checks are currently very barebones, and
would need to be updated with more robust checks for things like regex
characters in the middle of the pattern. As review moves forward (and if we
don't change the data storage) then we could spend more time on this.

Thanks, -Stolee

Updates in v2, relative to the RFC:

 * Instead of an 'add' subcommand, use a 'set' subcommand. We can consider
   adding 'add' and/or 'remove' subcommands later.
   
   
 * 'set' reads from the arguments by default. '--stdin' option is available.
   
   
 * A new performance-oriented commit is added at the end.
   
   
 * Patterns no longer end with a trailing asterisk except for the first "/*"
   pattern.
   
   
 * References to a "bug" (that was really a strange GVFS interaction in
   microsoft/git) around deleting outside the cone are removed.
   
   

Things to leave for future patches:

 1. Integrate in 'git worktree add' to copy the sparse-checkout file to a
    worktree-specific file.
    
    
 2. More robustness around detecting non-cone patterns with wildcards in the
    middle of the line.
    
    
 3. 'git clone --sparse-cone' to clone into "cone mode" sparse-checkouts
    (i.e. set 'core.sparseCheckoutCone=true'). This may not be
    super-valuable, as it only starts changing behavior when someone calls
    'git sparse-checkout set', but may be interesting.
    
    

[1] https://git-scm.com/docs/git-read-tree#_sparse_checkout
Sparse-checkout documentation in git-read-tree.

[2] https://stackoverflow.com/a/4909267/127088
Is it possible to do a sparse checkout without checking out the whole
repository first?

[3] http://www.marcoyuen.com/articles/2016/06/07/git-sparse.html
A blog post of a user's extra "git-sparse" helper.

[4] 
https://github.com/git/git/compare/fc5fd706ff733392053e6180086a4d7f96acc2af...01204f24c5349aa2fb0c474546d768946d315dab
The virtual filesystem hook in microsoft/git.

Derrick Stolee (10):
  sparse-checkout: create builtin with 'list' subcommand
  sparse-checkout: create 'init' subcommand
  clone: add --sparse mode
  sparse-checkout: 'set' subcommand
  sparse-checkout: add '--stdin' option to set subcommand
  sparse-checkout: create 'disable' subcommand
  sparse-checkout: add 'cone' mode
  sparse-checkout: use hashmaps for cone patterns
  sparse-checkout: init and set in cone mode
  unpack-trees: hash less in cone mode

Jeff Hostetler (1):
  trace2: add region in clear_ce_flags

 .gitignore                            |   1 +
 Documentation/config/core.txt         |   7 +-
 Documentation/git-clone.txt           |   8 +-
 Documentation/git-read-tree.txt       |   2 +-
 Documentation/git-sparse-checkout.txt | 148 ++++++++++
 Makefile                              |   1 +
 builtin.h                             |   1 +
 builtin/clone.c                       |  27 ++
 builtin/sparse-checkout.c             | 395 ++++++++++++++++++++++++++
 cache.h                               |   4 +-
 config.c                              |   5 +
 dir.c                                 | 173 ++++++++++-
 dir.h                                 |  31 ++
 environment.c                         |   1 +
 git.c                                 |   1 +
 t/t1091-sparse-checkout-builtin.sh    | 231 +++++++++++++++
 unpack-trees.c                        |  48 ++--
 17 files changed, 1055 insertions(+), 29 deletions(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh


base-commit: 468ce99b77a0efaf1ace4c31a7b0a7d036fd9ca1
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-316%2Fderrickstolee%2Fsparse-checkout%2Fupstream-v2
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-316/derrickstolee/sparse-checkout/upstream-v2
Pull-Request: https://github.com/gitgitgadget/git/pull/316

Range-diff vs v1:

  1:  c37b5f2c29 !  1:  dbaf3de88e sparse-checkout: create builtin with 'list' subcommand
     @@ -15,16 +15,18 @@
          builtin will be the preferred mechanism for manipulating the
          sparse-checkout file and syncing the working directory.
      
     -    For now, create the basics of the builtin. Includes a single
     -    subcommand, "git sparse-checkout list", that lists the patterns
     -    currently in the sparse-checkout file. Test that these patterns
     -    are parsed and written correctly to the output.
     +    The `$GIT_DIR/info/sparse-checkout` file defines the skip-
     +    worktree reference bitmap. When Git updates the working
     +    directory, it updates the skip-worktree bits in the index
     +    based on this file and removes or restores files in the
     +    working copy to match.
      
          The documentation provided is adapted from the "git read-tree"
          documentation with a few edits for clarity in the new context.
          Extra sections are added to hint toward a future change to
     -    a moer restricted pattern set.
     +    a more restricted pattern set.
      
     +    Helped-by: Elijah Newren <newren@gmail.com>
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
       diff --git a/.gitignore b/.gitignore
     @@ -114,8 +116,8 @@
      +files.
      +
      +While `$GIT_DIR/info/sparse-checkout` is usually used to specify what
     -+files are in, you can also specify what files are _not_ in, using
     -+negate patterns. For example, to remove the file `unwanted`:
     ++files are included, you can also specify what files are _not_ included,
     ++using negative patterns. For example, to remove the file `unwanted`:
      +
      +----------------
      +/*
     @@ -191,29 +193,24 @@
      +	NULL
      +};
      +
     -+struct opts_sparse_checkout {
     -+	const char *subcommand;
     -+	int read_stdin;
     -+} opts;
     -+
      +static char *get_sparse_checkout_filename(void)
      +{
      +	return git_pathdup("info/sparse-checkout");
      +}
      +
     -+static void write_excludes_to_file(FILE *fp, struct exclude_list *el)
     ++static void write_patterns_to_file(FILE *fp, struct pattern_list *pl)
      +{
      +	int i;
      +
     -+	for (i = 0; i < el->nr; i++) {
     -+		struct exclude *x = el->excludes[i];
     ++	for (i = 0; i < pl->nr; i++) {
     ++		struct path_pattern *p = pl->patterns[i];
      +
     -+		if (x->flags & EXC_FLAG_NEGATIVE)
     ++		if (p->flags & PATTERN_FLAG_NEGATIVE)
      +			fprintf(fp, "!");
      +
     -+		fprintf(fp, "%s", x->pattern);
     ++		fprintf(fp, "%s", p->pattern);
      +
     -+		if (x->flags & EXC_FLAG_MUSTBEDIR)
     ++		if (p->flags & PATTERN_FLAG_MUSTBEDIR)
      +			fprintf(fp, "/");
      +
      +		fprintf(fp, "\n");
     @@ -222,23 +219,23 @@
      +
      +static int sparse_checkout_list(int argc, const char **argv)
      +{
     -+	struct exclude_list el;
     ++	struct pattern_list pl;
      +	char *sparse_filename;
      +	int res;
      +
     -+	memset(&el, 0, sizeof(el));
     ++	memset(&pl, 0, sizeof(pl));
      +
      +	sparse_filename = get_sparse_checkout_filename();
     -+	res = add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
     ++	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
      +	free(sparse_filename);
      +
      +	if (res < 0) {
     -+		warning(_("failed to parse sparse-checkout file; it may not exist"));
     ++		warning(_("this worktree is not sparse (sparse-checkout file may not exist)"));
      +		return 0;
      +	}
      +
     -+	write_excludes_to_file(stdout, &el);
     -+	clear_exclude_list(&el);
     ++	write_patterns_to_file(stdout, &pl);
     ++	clear_pattern_list(&pl);
      +
      +	return 0;
      +}
     @@ -253,12 +250,13 @@
      +		usage_with_options(builtin_sparse_checkout_usage,
      +				   builtin_sparse_checkout_options);
      +
     -+	git_config(git_default_config, NULL);
      +	argc = parse_options(argc, argv, prefix,
      +			     builtin_sparse_checkout_options,
      +			     builtin_sparse_checkout_usage,
      +			     PARSE_OPT_STOP_AT_NON_OPTION);
      +
     ++	git_config(git_default_config, NULL);
     ++
      +	if (argc > 0) {
      +		if (!strcmp(argv[0], "list"))
      +			return sparse_checkout_list(argc, argv);
     @@ -313,7 +311,7 @@
      +test_expect_success 'git sparse-checkout list (empty)' '
      +	git -C repo sparse-checkout list >list 2>err &&
      +	test_line_count = 0 list &&
     -+	test_i18ngrep "failed to parse sparse-checkout file; it may not exist" err
     ++	test_i18ngrep "this worktree is not sparse (sparse-checkout file may not exist)" err
      +'
      +
      +test_expect_success 'git sparse-checkout list (populated)' '
     @@ -335,4 +333,4 @@
      +'
      +
      +test_done
     - \ No newline at end of file
     ++
  2:  e6e982e5a6 !  2:  412211f5dd sparse-checkout: create 'init' subcommand
     @@ -51,7 +51,7 @@
       	return 0;
       }
       
     -+static int sc_read_tree(void)
     ++static int update_working_directory(void)
      +{
      +	struct argv_array argv = ARGV_ARRAY_INIT;
      +	int result = 0;
     @@ -69,45 +69,25 @@
      +static int sc_enable_config(void)
      +{
      +	struct argv_array argv = ARGV_ARRAY_INIT;
     -+	int result = 0;
     -+	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", "true", NULL);
      +
     -+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
     -+		error(_("failed to enable core.sparseCheckout"));
     -+		result = 1;
     ++	if (git_config_set_gently("extensions.worktreeConfig", "true")) {
     ++		error(_("failed to set extensions.worktreeConfig setting"));
     ++		return 1;
      +	}
      +
     -+	argv_array_clear(&argv);
     -+	return result;
     -+}
     -+
     -+static int delete_directory(const struct object_id *oid, struct strbuf *base,
     -+		const char *pathname, unsigned mode, int stage, void *context)
     -+{
     -+	struct strbuf dirname = STRBUF_INIT;
     -+	struct stat sb;
     -+
     -+	strbuf_addstr(&dirname, the_repository->worktree);
     -+	strbuf_addch(&dirname, '/');
     -+	strbuf_addstr(&dirname, pathname);
     -+
     -+	if (stat(dirname.buf, &sb) || !(sb.st_mode & S_IFDIR))
     -+		return 0;
     ++	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
      +
     -+	if (remove_dir_recursively(&dirname, 0))
     -+		warning(_("failed to remove directory '%s'"),
     -+			dirname.buf);
     ++	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
     ++		error(_("failed to enable core.sparseCheckout"));
     ++		return 1;
     ++	}
      +
     -+	strbuf_release(&dirname);
      +	return 0;
      +}
      +
      +static int sparse_checkout_init(int argc, const char **argv)
      +{
     -+	struct tree *t;
     -+	struct object_id oid;
     -+	struct exclude_list el;
     -+	static struct pathspec pathspec;
     ++	struct pattern_list pl;
      +	char *sparse_filename;
      +	FILE *fp;
      +	int res;
     @@ -115,10 +95,10 @@
      +	if (sc_enable_config())
      +		return 1;
      +
     -+	memset(&el, 0, sizeof(el));
     ++	memset(&pl, 0, sizeof(pl));
      +
      +	sparse_filename = get_sparse_checkout_filename();
     -+	res = add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
     ++	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
      +
      +	/* If we already have a sparse-checkout file, use it. */
      +	if (res >= 0) {
     @@ -129,28 +109,11 @@
      +	/* initial mode: all blobs at root */
      +	fp = fopen(sparse_filename, "w");
      +	free(sparse_filename);
     -+	fprintf(fp, "/*\n!/*/*\n");
     ++	fprintf(fp, "/*\n!/*/\n");
      +	fclose(fp);
      +
     -+	/* remove all directories in the root, if tracked by Git */
     -+	if (get_oid("HEAD", &oid)) {
     -+		/* assume we are in a fresh repo */
     -+		return 0;
     -+	}
     -+
     -+	t = parse_tree_indirect(&oid);
     -+
     -+	parse_pathspec(&pathspec, PATHSPEC_ALL_MAGIC &
     -+				  ~(PATHSPEC_FROMTOP | PATHSPEC_LITERAL),
     -+		       PATHSPEC_PREFER_CWD,
     -+		       "", NULL);
     -+
     -+	if (read_tree_recursive(the_repository, t, "", 0, 0, &pathspec,
     -+				delete_directory, NULL))
     -+		return 1;
     -+
      +reset_dir:
     -+	return sc_read_tree();
     ++	return update_working_directory();
      +}
      +
       int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
     @@ -177,7 +140,7 @@
      +	git -C repo sparse-checkout init &&
      +	cat >expect <<-EOF &&
      +		/*
     -+		!/*/*
     ++		!/*/
      +	EOF
      +	test_cmp expect repo/.git/info/sparse-checkout &&
      +	git -C repo config --list >config &&
     @@ -191,27 +154,28 @@
      +	git -C repo sparse-checkout list >actual &&
      +	cat >expect <<-EOF &&
      +		/*
     -+		!/*/*
     ++		!/*/
      +	EOF
      +	test_cmp expect actual
      +'
      +
      +test_expect_success 'init with existing sparse-checkout' '
     -+	echo "/folder1/*" >> repo/.git/info/sparse-checkout &&
     ++	echo "*folder*" >> repo/.git/info/sparse-checkout &&
      +	git -C repo sparse-checkout init &&
      +	cat >expect <<-EOF &&
      +		/*
     -+		!/*/*
     -+		/folder1/*
     ++		!/*/
     ++		*folder*
      +	EOF
      +	test_cmp expect repo/.git/info/sparse-checkout &&
      +	ls repo >dir  &&
      +	cat >expect <<-EOF &&
      +		a
      +		folder1
     ++		folder2
      +	EOF
      +	test_cmp expect dir
      +'
      +
       test_done
     - \ No newline at end of file
     + 
  3:  4ccd36b396 !  3:  fef41b794a clone: add --sparse mode
     @@ -13,13 +13,18 @@
          sparse-checkout file to include the following patterns:
      
                  /*
     -            !/*/*
     +            !/*/
      
          These patterns include every file in the root directory, but
          no directories. This allows a repo to include files like a
          README or a bootstrapping script to grow enlistments from that
          point.
      
     +    During the 'git sparse-checkout init' call, we must first look
     +    to see if HEAD is valid, or else we will fail while trying to
     +    update the working directory. The first checkout will actually
     +    update the working directory correctly.
     +
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
       diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
     @@ -107,6 +112,30 @@
       
       	strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
      
     + diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
     + --- a/builtin/sparse-checkout.c
     + +++ b/builtin/sparse-checkout.c
     +@@
     + 	char *sparse_filename;
     + 	FILE *fp;
     + 	int res;
     ++	struct object_id oid;
     + 
     + 	if (sc_enable_config())
     + 		return 1;
     +@@
     + 	fprintf(fp, "/*\n!/*/\n");
     + 	fclose(fp);
     + 
     ++	if (get_oid("HEAD", &oid)) {
     ++		/* assume we are in a fresh repo */
     ++		return 0;
     ++	}
     ++
     + reset_dir:
     + 	return update_working_directory();
     + }
     +
       diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
       --- a/t/t1091-sparse-checkout-builtin.sh
       +++ b/t/t1091-sparse-checkout-builtin.sh
     @@ -119,7 +148,7 @@
      +	git -C clone sparse-checkout list >actual &&
      +	cat >expect <<-EOF &&
      +		/*
     -+		!/*/*
     ++		!/*/
      +	EOF
      +	test_cmp expect actual &&
      +	ls clone >dir &&
     @@ -128,4 +157,4 @@
      +'
      +
       test_done
     - \ No newline at end of file
     + 
  4:  0f095e85d5 !  4:  9a78f9ea0f sparse-checkout: 'add' subcommand
     @@ -1,14 +1,15 @@
      Author: Derrick Stolee <dstolee@microsoft.com>
      
     -    sparse-checkout: 'add' subcommand
     +    sparse-checkout: 'set' subcommand
      
     -    The 'git sparse-checkout add' subcommand takes a list of patterns
     -    over stdin and writes them to the sparse-checkout file. Then, it
     +    The 'git sparse-checkout set' subcommand takes a list of patterns
     +    as arguments and writes them to the sparse-checkout file. Then, it
          updates the working directory using 'git read-tree -mu HEAD'.
      
     -    Note: if a user adds a negative pattern that would lead to the
     -    removal of a non-empty directory, then Git may not delete that
     -    directory (on Windows).
     +    The 'set' subcommand will replace the entire contents of the
     +    sparse-checkout file. The write_patterns_and_update() method is
     +    extracted from cmd_sparse_checkout() to make it easier to implement
     +    'add' and/or 'remove' subcommands in the future.
      
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
     @@ -19,9 +20,10 @@
       	by Git. Add patterns to the sparse-checkout file to
       	repopulate the working directory.
       
     -+'add'::
     -+	Add a set of patterns to the sparse-checkout file, as given over
     -+	stdin. Updates the working directory to match the new patterns.
     ++'set'::
     ++	Write a set of patterns to the sparse-checkout file, as given as
     ++	a list of arguments following the 'set' subcommand. Update the
     ++	working directory to match the new patterns.
      +
       SPARSE CHECKOUT
       ----------------
     @@ -35,40 +37,39 @@
       
       static char const * const builtin_sparse_checkout_usage[] = {
      -	N_("git sparse-checkout [init|list]"),
     -+	N_("git sparse-checkout [init|add|list]"),
     ++	N_("git sparse-checkout [init|list|set] <options>"),
       	NULL
       };
       
      @@
     - 	return sc_read_tree();
     + 	return update_working_directory();
       }
       
     -+static int sparse_checkout_add(int argc, const char **argv)
     ++static int write_patterns_and_update(struct pattern_list *pl)
      +{
     -+	struct exclude_list el;
      +	char *sparse_filename;
      +	FILE *fp;
     -+	struct strbuf line = STRBUF_INIT;
     -+
     -+	memset(&el, 0, sizeof(el));
      +
      +	sparse_filename = get_sparse_checkout_filename();
     -+	add_excludes_from_file_to_list(sparse_filename, "", 0, &el, NULL);
     -+
      +	fp = fopen(sparse_filename, "w");
     -+	write_excludes_to_file(fp, &el);
     -+
     -+	while (!strbuf_getline(&line, stdin)) {
     -+		strbuf_trim(&line);
     -+		fprintf(fp, "%s\n", line.buf);
     -+	}
     -+
     ++	write_patterns_to_file(fp, pl);
      +	fclose(fp);
      +	free(sparse_filename);
      +
     -+	clear_exclude_list(&el);
     ++	clear_pattern_list(pl);
     ++	return update_working_directory();
     ++}
      +
     -+	return sc_read_tree();
     ++static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
     ++{
     ++	int i;
     ++	struct pattern_list pl;
     ++	memset(&pl, 0, sizeof(pl));
     ++
     ++	for (i = 1; i < argc; i++)
     ++		add_pattern(argv[i], NULL, 0, &pl, 0);
     ++
     ++	return write_patterns_and_update(&pl);
      +}
      +
       int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
     @@ -78,8 +79,8 @@
       			return sparse_checkout_list(argc, argv);
       		if (!strcmp(argv[0], "init"))
       			return sparse_checkout_init(argc, argv);
     -+		if (!strcmp(argv[0], "add"))
     -+			return sparse_checkout_add(argc, argv);
     ++		if (!strcmp(argv[0], "set"))
     ++			return sparse_checkout_set(argc, argv, prefix);
       	}
       
       	usage_with_options(builtin_sparse_checkout_usage,
     @@ -91,13 +92,12 @@
       	test_cmp expect dir
       '
       
     -+test_expect_success 'add to existing sparse-checkout' '
     -+	echo "/folder2/*" | git -C repo sparse-checkout add &&
     ++test_expect_success 'set sparse-checkout using builtin' '
     ++	git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
      +	cat >expect <<-EOF &&
      +		/*
     -+		!/*/*
     -+		/folder1/*
     -+		/folder2/*
     ++		!/*/
     ++		*folder*
      +	EOF
      +	git -C repo sparse-checkout list >actual &&
      +	test_cmp expect actual &&
     @@ -112,4 +112,4 @@
      +'
      +
       test_done
     - \ No newline at end of file
     + 
  -:  ---------- >  5:  21a0165be7 sparse-checkout: add '--stdin' option to set subcommand
  5:  5f332b799f !  6:  b62b76013f sparse-checkout: create 'disable' subcommand
     @@ -13,8 +13,8 @@
       --- a/Documentation/git-sparse-checkout.txt
       +++ b/Documentation/git-sparse-checkout.txt
      @@
     - 	Add a set of patterns to the sparse-checkout file, as given over
     - 	stdin. Updates the working directory to match the new patterns.
     + 	a list of arguments following the 'set' subcommand. Update the
     + 	working directory to match the new patterns.
       
      +'disable'::
      +	Remove the sparse-checkout file, set `core.sparseCheckout` to
     @@ -30,7 +30,7 @@
      +To repopulate the working directory with all files, use the
      +`git sparse-checkout disable` command.
      +
     -+Sparse checkout support in 'git read-tree' and similar commands is
     ++Sparse checkout support in 'git checkout' and similar commands is
      +disabled by default. You need to set `core.sparseCheckout` to `true`
      +in order to have sparse checkout support.
      +
     @@ -67,8 +67,8 @@
       #include "strbuf.h"
       
       static char const * const builtin_sparse_checkout_usage[] = {
     --	N_("git sparse-checkout [init|add|list]"),
     -+	N_("git sparse-checkout [init|add|list|disable]"),
     +-	N_("git sparse-checkout [init|list|set] <options>"),
     ++	N_("git sparse-checkout [init|list|set|disable] <options>"),
       	NULL
       };
       
     @@ -80,36 +80,32 @@
      +static int sc_set_config(int mode)
       {
       	struct argv_array argv = ARGV_ARRAY_INIT;
     - 	int result = 0;
     --	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", "true", NULL);
     -+	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", NULL);
     + 
     +@@
     + 		return 1;
     + 	}
     + 
     +-	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
     ++	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
      +
     -+	switch (mode) {
     -+	case 1:
     ++	if (mode)
      +		argv_array_pushl(&argv, "true", NULL);
     -+		break;
     -+
     -+	case 0:
     ++	else
      +		argv_array_pushl(&argv, "false", NULL);
     -+		break;
     -+
     -+	default:
     -+		die(_("invalid config mode"));
     -+	}
       
       	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
       		error(_("failed to enable core.sparseCheckout"));
      @@
     - 	FILE *fp;
       	int res;
     + 	struct object_id oid;
       
      -	if (sc_enable_config())
      +	if (sc_set_config(1))
       		return 1;
       
     - 	memset(&el, 0, sizeof(el));
     + 	memset(&pl, 0, sizeof(pl));
      @@
     - 	return sc_read_tree();
     + 	return write_patterns_and_update(&pl);
       }
       
      +static int sparse_checkout_disable(int argc, const char **argv)
     @@ -125,7 +121,7 @@
      +	fprintf(fp, "/*\n");
      +	fclose(fp);
      +
     -+	if (sc_read_tree())
     ++	if (update_working_directory())
      +		die(_("error while refreshing working directory"));
      +
      +	unlink(sparse_filename);
     @@ -139,8 +135,8 @@
       	static struct option builtin_sparse_checkout_options[] = {
      @@
       			return sparse_checkout_init(argc, argv);
     - 		if (!strcmp(argv[0], "add"))
     - 			return sparse_checkout_add(argc, argv);
     + 		if (!strcmp(argv[0], "set"))
     + 			return sparse_checkout_set(argc, argv, prefix);
      +		if (!strcmp(argv[0], "disable"))
      +			return sparse_checkout_disable(argc, argv);
       	}
     @@ -170,4 +166,4 @@
      +'
      +
       test_done
     - \ No newline at end of file
     + 
  6:  86f12dc77d !  7:  25642f8df2 trace2:experiment: clear_ce_flags_1
     @@ -1,11 +1,16 @@
      Author: Jeff Hostetler <jeffhost@microsoft.com>
      
     -    trace2:experiment: clear_ce_flags_1
     +    trace2: add region in clear_ce_flags
      
     -    The clear_ce_flags_1 method is used by many types of calls to
     -    unpack_trees(). Add trace2 regions around the method, including
     -    some flag information, so we can get granular performance data
     -    during experiments.
     +    When Git updates the working directory with the sparse-checkout
     +    feature enabled, the unpack_trees() method calls clear_ce_flags()
     +    to update the skip-worktree bits on the cache entries. This
     +    check can be expensive, depending on the patterns used.
     +
     +    Add trace2 regions around the method, including some flag
     +    information, so we can get granular performance data during
     +    experiments. This data will be used to measure improvements
     +    to the pattern-matching algorithms for sparse-checkout.
      
          Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
     @@ -14,7 +19,7 @@
       --- a/unpack-trees.c
       +++ b/unpack-trees.c
      @@
     - 			  struct exclude_list *el)
     + 			  struct pattern_list *pl)
       {
       	static struct strbuf prefix = STRBUF_INIT;
      +	char label[100];
     @@ -25,14 +30,14 @@
      -	return clear_ce_flags_1(istate,
      +	xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)",
      +		  (unsigned long)select_mask, (unsigned long)clear_mask);
     -+	trace2_region_enter("exp", label, the_repository);
     ++	trace2_region_enter("unpack_trees", label, the_repository);
      +	rval = clear_ce_flags_1(istate,
       				istate->cache,
       				istate->cache_nr,
       				&prefix,
       				select_mask, clear_mask,
     - 				el, 0);
     -+	trace2_region_leave("exp", label, the_repository);
     + 				pl, 0);
     ++	trace2_region_leave("unpack_trees", label, the_repository);
      +
      +	return rval;
       }
  7:  19d664a5da !  8:  84511255d1 sparse-checkout: add 'cone' mode
     @@ -7,14 +7,14 @@
          If there are 1,000 patterns and 1,000,000 entries, this time can
          be very significant.
      
     -    Create a new 'cone' mode for the core.sparseCheckout config
     -    option, and adjust the parser to set an appropriate enum value.
     +    Create a new Boolean config option, core.sparseCheckoutCone, to
     +    indicate that we expect the sparse-checkout file to contain a
     +    more limited set of patterns. This is a separate config setting
     +    from core.sparseCheckout to avoid breaking older clients by
     +    introducing a tri-state option.
      
     -    While adjusting the type of this variable, rename it from
     -    core_apply_sparse_checkout to core_sparse_checkout. This will
     -    help avoid parallel changes from hitting type issues, and we
     -    can guarantee that all uses now consider the enum values instead
     -    of the int value.
     +    The config option does nothing right now, but will be expanded
     +    upon in a later commit.
      
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
     @@ -49,7 +49,7 @@
      +inclusion/exclusion rules. These can result in O(N*M) pattern matches when
      +updating the index, where N is the number of patterns and M is the number
      +of paths in the index. To combat this performance issue, a more restricted
     -+pattern set is allowed when `core.spareCheckout` is set to `cone`.
     ++pattern set is allowed when `core.sparseCheckoutCone` is enabled.
      +
      +The accepted patterns in the cone pattern set are:
      +
     @@ -67,7 +67,7 @@
      +
      +```
      +/*
     -+!/*/*
     ++!/*/
      +```
      +
      +This says "include everything in root, but nothing two levels below root."
     @@ -77,18 +77,18 @@
      +
      +```
      +/*
     -+!/*/*
     -+/A/*
     -+!/A/*/*
     -+/A/B/*
     -+!/A/B/*/*
     -+/A/B/C/*
     ++!/*/
     ++/A/
     ++!/A/*/
     ++/A/B/
     ++!/A/B/*/
     ++/A/B/C/
      +```
      +
      +Here, order matters, so the negative patterns are overridden by the positive
      +patterns that appear lower in the file.
      +
     -+If `core.sparseCheckout=cone`, then Git will parse the sparse-checkout file
     ++If `core.sparseCheckoutCone=true`, then Git will parse the sparse-checkout file
      +expecting patterns of these types. Git will warn if the patterns do not match.
      +If the patterns do match the expected format, then Git will use faster hash-
      +based algorithms to compute inclusion in the sparse-checkout.
     @@ -97,76 +97,6 @@
       --------
       
      
     - diff --git a/builtin/clone.c b/builtin/clone.c
     - --- a/builtin/clone.c
     - +++ b/builtin/clone.c
     -@@
     - 	 * We must apply the setting in the current process
     - 	 * for the later checkout to use the sparse-checkout file.
     - 	 */
     --	core_apply_sparse_checkout = 1;
     -+	core_sparse_checkout = SPARSE_CHECKOUT_FULL;
     - 
     - 	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
     - 		error(_("failed to initialize sparse-checkout"));
     -
     - diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
     - --- a/builtin/sparse-checkout.c
     - +++ b/builtin/sparse-checkout.c
     -@@
     - 	return result;
     - }
     - 
     --static int sc_set_config(int mode)
     -+static int sc_set_config(enum sparse_checkout_mode mode)
     - {
     - 	struct argv_array argv = ARGV_ARRAY_INIT;
     - 	int result = 0;
     - 	argv_array_pushl(&argv, "config", "--add", "core.sparseCheckout", NULL);
     - 
     - 	switch (mode) {
     --	case 1:
     -+	case SPARSE_CHECKOUT_FULL:
     - 		argv_array_pushl(&argv, "true", NULL);
     - 		break;
     - 
     --	case 0:
     -+	case SPARSE_CHECKOUT_CONE:
     -+		argv_array_pushl(&argv, "cone", NULL);
     -+		break;
     -+
     -+	case SPARSE_CHECKOUT_NONE:
     - 		argv_array_pushl(&argv, "false", NULL);
     - 		break;
     - 
     -@@
     - 	FILE *fp;
     - 	int res;
     - 
     --	if (sc_set_config(1))
     -+	if (sc_set_config(SPARSE_CHECKOUT_FULL))
     - 		return 1;
     - 
     - 	memset(&el, 0, sizeof(el));
     -@@
     - 	char *sparse_filename;
     - 	FILE *fp;
     - 
     --	if (sc_set_config(1))
     -+	if (sc_set_config(SPARSE_CHECKOUT_FULL))
     - 		die(_("failed to change config"));
     - 
     - 	sparse_filename = get_sparse_checkout_filename();
     -@@
     - 	unlink(sparse_filename);
     - 	free(sparse_filename);
     - 
     --	return sc_set_config(0);
     -+	return sc_set_config(SPARSE_CHECKOUT_NONE);
     - }
     - 
     - int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
     -
       diff --git a/cache.h b/cache.h
       --- a/cache.h
       +++ b/cache.h
     @@ -180,12 +110,8 @@
       extern int protect_ntfs;
       extern const char *core_fsmonitor;
       
     -+enum sparse_checkout_mode {
     -+	SPARSE_CHECKOUT_NONE = 0,
     -+	SPARSE_CHECKOUT_FULL = 1,
     -+	SPARSE_CHECKOUT_CONE = 2,
     -+};
     -+enum sparse_checkout_mode core_sparse_checkout;
     ++int core_apply_sparse_checkout;
     ++int core_sparse_checkout_cone;
      +
       /*
        * Include broken refs in all ref iterations, which will
     @@ -195,32 +121,26 @@
       --- a/config.c
       +++ b/config.c
      @@
     + 		return 0;
       	}
       
     - 	if (!strcmp(var, "core.sparsecheckout")) {
     --		core_apply_sparse_checkout = git_config_bool(var, value);
     -+		int result = git_parse_maybe_bool(value);
     ++	if (!strcmp(var, "core.sparsecheckoutcone")) {
     ++		core_sparse_checkout_cone = git_config_bool(var, value);
     ++		return 0;
     ++	}
      +
     -+		if (result < 0) {
     -+			core_sparse_checkout = SPARSE_CHECKOUT_NONE;
     -+
     -+			if (!strcasecmp(value, "cone"))
     -+				core_sparse_checkout = SPARSE_CHECKOUT_CONE;
     -+		} else
     -+			core_sparse_checkout = result;
     + 	if (!strcmp(var, "core.precomposeunicode")) {
     + 		precomposed_unicode = git_config_bool(var, value);
       		return 0;
     - 	}
     - 
      
       diff --git a/environment.c b/environment.c
       --- a/environment.c
       +++ b/environment.c
      @@
     - enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
       char *notes_ref_name;
       int grafts_replace_parents = 1;
     --int core_apply_sparse_checkout;
     -+enum sparse_checkout_mode core_sparse_checkout;
     + int core_apply_sparse_checkout;
     ++int core_sparse_checkout_cone;
       int merge_log_config = -1;
       int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
       unsigned long pack_size_limit_cfg;
     @@ -233,7 +153,7 @@
       '
       
      +test_expect_success 'cone mode: match patterns' '
     -+	git -C repo config --replace-all core.sparseCheckout cone &&
     ++	git -C repo config --worktree core.sparseCheckoutCone true &&
      +	rm -rf repo/a repo/folder1 repo/folder2 &&
      +	git -C repo read-tree -mu HEAD &&
      +	git -C repo reset --hard &&
     @@ -249,16 +169,3 @@
       test_expect_success 'sparse-checkout disable' '
       	git -C repo sparse-checkout disable &&
       	test_path_is_missing repo/.git/info/sparse-checkout &&
     -
     - diff --git a/unpack-trees.c b/unpack-trees.c
     - --- a/unpack-trees.c
     - +++ b/unpack-trees.c
     -@@
     - 
     - 	trace_performance_enter();
     - 	memset(&el, 0, sizeof(el));
     --	if (!core_apply_sparse_checkout || !o->update)
     -+	if (!core_sparse_checkout || !o->update)
     - 		o->skip_sparse_checkout = 1;
     - 	if (!o->skip_sparse_checkout) {
     - 		char *sparse = git_pathdup("info/sparse-checkout");
  8:  b99acea4a0 <  -:  ---------- sparse-checkout: use hashmaps for cone patterns
  9:  568fda2d03 <  -:  ---------- sparse-checkout: init and add in cone mode
  -:  ---------- >  9:  95a3285bc6 sparse-checkout: use hashmaps for cone patterns
  -:  ---------- > 10:  995c5b8e2b sparse-checkout: init and set in cone mode
  -:  ---------- > 11:  1d4321488e unpack-trees: hash less in cone mode

-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 01/11] sparse-checkout: create builtin with 'list' subcommand
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-10-05 19:22     ` Elijah Newren
  2019-09-19 14:43   ` [PATCH v2 02/11] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
                     ` (11 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature is mostly hidden to users, as its
only documentation is supplementary information in the docs for
'git read-tree'. In addition, users need to know how to edit the
.git/info/sparse-checkout file with the right patterns, then run
the appropriate 'git read-tree -mu HEAD' command. Keeping the
working directory in sync with the sparse-checkout file requires
care.

Begin an effort to make the sparse-checkout feature a porcelain
feature by creating a new 'git sparse-checkout' builtin. This
builtin will be the preferred mechanism for manipulating the
sparse-checkout file and syncing the working directory.

The `$GIT_DIR/info/sparse-checkout` file defines the skip-
worktree reference bitmap. When Git updates the working
directory, it updates the skip-worktree bits in the index
based on this file and removes or restores files in the
working copy to match.

The documentation provided is adapted from the "git read-tree"
documentation with a few edits for clarity in the new context.
Extra sections are added to hint toward a future change to
a more restricted pattern set.

Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 .gitignore                            |  1 +
 Documentation/git-read-tree.txt       |  2 +-
 Documentation/git-sparse-checkout.txt | 90 +++++++++++++++++++++++++++
 Makefile                              |  1 +
 builtin.h                             |  1 +
 builtin/sparse-checkout.c             | 86 +++++++++++++++++++++++++
 git.c                                 |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 51 +++++++++++++++
 8 files changed, 232 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh

diff --git a/.gitignore b/.gitignore
index 4470d7cfc0..5ccc3d00dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -156,6 +156,7 @@
 /git-show-branch
 /git-show-index
 /git-show-ref
+/git-sparse-checkout
 /git-stage
 /git-stash
 /git-status
diff --git a/Documentation/git-read-tree.txt b/Documentation/git-read-tree.txt
index d271842608..da33f84f33 100644
--- a/Documentation/git-read-tree.txt
+++ b/Documentation/git-read-tree.txt
@@ -436,7 +436,7 @@ support.
 SEE ALSO
 --------
 linkgit:git-write-tree[1]; linkgit:git-ls-files[1];
-linkgit:gitignore[5]
+linkgit:gitignore[5]; linkgit:git-sparse-checkout[1];
 
 GIT
 ---
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
new file mode 100644
index 0000000000..cdef451642
--- /dev/null
+++ b/Documentation/git-sparse-checkout.txt
@@ -0,0 +1,90 @@
+git-sparse-checkout(1)
+=======================
+
+NAME
+----
+git-sparse-checkout - Initialize and modify the sparse-checkout
+configuration, which reduces the checkout to a set of directories
+given by a list of prefixes.
+
+
+SYNOPSIS
+--------
+[verse]
+'git sparse-checkout <subcommand> [options]'
+
+
+DESCRIPTION
+-----------
+
+Initialize and modify the sparse-checkout configuration, which reduces
+the checkout to a set of directories given by a list of prefixes.
+
+
+COMMANDS
+--------
+'list'::
+	Provide a list of the contents in the sparse-checkout file.
+
+
+SPARSE CHECKOUT
+----------------
+
+"Sparse checkout" allows populating the working directory sparsely.
+It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
+Git whether a file in the working directory is worth looking at. If
+the skip-worktree bit is set, then the file is ignored in the working
+directory. Git will not populate the contents of those files, which
+makes a sparse checkout helpful when working in a repository with many
+files, but only a few are important to the current user.
+
+The `$GIT_DIR/info/sparse-checkout` file is used to define the
+skip-worktree reference bitmap. When Git updates the working
+directory, it resets the skip-worktree bit in the index based on this
+file. If an entry
+matches a pattern in this file, skip-worktree will not be set on
+that entry. Otherwise, skip-worktree will be set.
+
+Then it compares the new skip-worktree value with the previous one. If
+skip-worktree turns from set to unset, it will add the corresponding
+file back. If it turns from unset to set, that file will be removed.
+
+## FULL PATTERN SET
+
+By default, the sparse-checkout file uses the same syntax as `.gitignore`
+files.
+
+While `$GIT_DIR/info/sparse-checkout` is usually used to specify what
+files are included, you can also specify what files are _not_ included,
+using negative patterns. For example, to remove the file `unwanted`:
+
+----------------
+/*
+!unwanted
+----------------
+
+Another tricky thing is fully repopulating the working directory when you
+no longer want sparse checkout. You cannot just disable "sparse
+checkout" because skip-worktree bits are still in the index and your working
+directory is still sparsely populated. You should re-populate the working
+directory with the `$GIT_DIR/info/sparse-checkout` file content as
+follows:
+
+----------------
+/*
+----------------
+
+Then you can disable sparse checkout. Sparse checkout support in 'git
+read-tree' and similar commands is disabled by default. You need to
+set `core.sparseCheckout` to `true` in order to have sparse checkout
+support.
+
+SEE ALSO
+--------
+
+linkgit:git-read-tree[1]
+linkgit:gitignore[5]
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index f58bf14c7b..f3322b75dd 100644
--- a/Makefile
+++ b/Makefile
@@ -1121,6 +1121,7 @@ BUILTIN_OBJS += builtin/shortlog.o
 BUILTIN_OBJS += builtin/show-branch.o
 BUILTIN_OBJS += builtin/show-index.o
 BUILTIN_OBJS += builtin/show-ref.o
+BUILTIN_OBJS += builtin/sparse-checkout.o
 BUILTIN_OBJS += builtin/stash.o
 BUILTIN_OBJS += builtin/stripspace.o
 BUILTIN_OBJS += builtin/submodule--helper.o
diff --git a/builtin.h b/builtin.h
index ec7e0954c4..d517068faa 100644
--- a/builtin.h
+++ b/builtin.h
@@ -223,6 +223,7 @@ int cmd_shortlog(int argc, const char **argv, const char *prefix);
 int cmd_show(int argc, const char **argv, const char *prefix);
 int cmd_show_branch(int argc, const char **argv, const char *prefix);
 int cmd_show_index(int argc, const char **argv, const char *prefix);
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix);
 int cmd_status(int argc, const char **argv, const char *prefix);
 int cmd_stash(int argc, const char **argv, const char *prefix);
 int cmd_stripspace(int argc, const char **argv, const char *prefix);
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
new file mode 100644
index 0000000000..eed9625a05
--- /dev/null
+++ b/builtin/sparse-checkout.c
@@ -0,0 +1,86 @@
+#include "builtin.h"
+#include "config.h"
+#include "dir.h"
+#include "parse-options.h"
+#include "pathspec.h"
+#include "repository.h"
+#include "run-command.h"
+#include "strbuf.h"
+
+static char const * const builtin_sparse_checkout_usage[] = {
+	N_("git sparse-checkout [list]"),
+	NULL
+};
+
+static char *get_sparse_checkout_filename(void)
+{
+	return git_pathdup("info/sparse-checkout");
+}
+
+static void write_patterns_to_file(FILE *fp, struct pattern_list *pl)
+{
+	int i;
+
+	for (i = 0; i < pl->nr; i++) {
+		struct path_pattern *p = pl->patterns[i];
+
+		if (p->flags & PATTERN_FLAG_NEGATIVE)
+			fprintf(fp, "!");
+
+		fprintf(fp, "%s", p->pattern);
+
+		if (p->flags & PATTERN_FLAG_MUSTBEDIR)
+			fprintf(fp, "/");
+
+		fprintf(fp, "\n");
+	}
+}
+
+static int sparse_checkout_list(int argc, const char **argv)
+{
+	struct pattern_list pl;
+	char *sparse_filename;
+	int res;
+
+	memset(&pl, 0, sizeof(pl));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
+	free(sparse_filename);
+
+	if (res < 0) {
+		warning(_("this worktree is not sparse (sparse-checkout file may not exist)"));
+		return 0;
+	}
+
+	write_patterns_to_file(stdout, &pl);
+	clear_pattern_list(&pl);
+
+	return 0;
+}
+
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
+{
+	static struct option builtin_sparse_checkout_options[] = {
+		OPT_END(),
+	};
+
+	if (argc == 2 && !strcmp(argv[1], "-h"))
+		usage_with_options(builtin_sparse_checkout_usage,
+				   builtin_sparse_checkout_options);
+
+	argc = parse_options(argc, argv, prefix,
+			     builtin_sparse_checkout_options,
+			     builtin_sparse_checkout_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+
+	git_config(git_default_config, NULL);
+
+	if (argc > 0) {
+		if (!strcmp(argv[0], "list"))
+			return sparse_checkout_list(argc, argv);
+	}
+
+	usage_with_options(builtin_sparse_checkout_usage,
+			   builtin_sparse_checkout_options);
+}
diff --git a/git.c b/git.c
index c2eec470c9..e775fbad42 100644
--- a/git.c
+++ b/git.c
@@ -576,6 +576,7 @@ static struct cmd_struct commands[] = {
 	{ "show-branch", cmd_show_branch, RUN_SETUP },
 	{ "show-index", cmd_show_index },
 	{ "show-ref", cmd_show_ref, RUN_SETUP },
+	{ "sparse-checkout", cmd_sparse_checkout, RUN_SETUP | NEED_WORK_TREE },
 	{ "stage", cmd_add, RUN_SETUP | NEED_WORK_TREE },
 	/*
 	 * NEEDSWORK: Until the builtin stash is thoroughly robust and no
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
new file mode 100755
index 0000000000..46e7b2dded
--- /dev/null
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+test_description='sparse checkout builtin tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	git init repo &&
+	(
+		cd repo &&
+		echo "initial" >a &&
+		mkdir folder1 folder2 deep &&
+		mkdir deep/deeper1 deep/deeper2 &&
+		mkdir deep/deeper1/deepest &&
+		cp a folder1 &&
+		cp a folder2 &&
+		cp a deep &&
+		cp a deep/deeper1 &&
+		cp a deep/deeper2 &&
+		cp a deep/deeper1/deepest &&
+		git add . &&
+		git commit -m "initial commit"
+	)
+'
+
+test_expect_success 'git sparse-checkout list (empty)' '
+	git -C repo sparse-checkout list >list 2>err &&
+	test_line_count = 0 list &&
+	test_i18ngrep "this worktree is not sparse (sparse-checkout file may not exist)" err
+'
+
+test_expect_success 'git sparse-checkout list (populated)' '
+	test_when_finished rm -f repo/.git/info/sparse-checkout &&
+	cat >repo/.git/info/sparse-checkout <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	git -C repo sparse-checkout list >list &&
+	cat >expect <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	test_cmp expect list
+'
+
+test_done
+
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 02/11] sparse-checkout: create 'init' subcommand
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
  2019-09-19 14:43   ` [PATCH v2 01/11] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-10-05 19:34     ` Elijah Newren
  2019-09-19 14:43   ` [PATCH v2 03/11] clone: add --sparse mode Derrick Stolee via GitGitGadget
                     ` (10 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

Getting started with a sparse-checkout file can be daunting. Help
users start their sparse enlistment using 'git sparse-checkout init'.
This will set 'core.sparseCheckout=true' in their config, write
an initial set of patterns to the sparse-checkout file, and update
their working directory.

Using 'git read-tree' to clear directories does not work cleanly
on Windows, so manually delete directories that are tracked by Git
before running read-tree.

The use of running another process for 'git read-tree' is likely
suboptimal, but that can be improved in a later change, if valuable.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt |  7 +++
 builtin/sparse-checkout.c             | 69 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 41 ++++++++++++++++
 3 files changed, 116 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index cdef451642..9707ef93b1 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -26,6 +26,13 @@ COMMANDS
 'list'::
 	Provide a list of the contents in the sparse-checkout file.
 
+'init'::
+	Enable the `core.sparseCheckout` setting. If the
+	sparse-checkout file does not exist, then populate it with
+	patterns that match every file in the root directory and
+	no other directories, then remove all directories tracked
+	by Git. Add patterns to the sparse-checkout file to
+	repopulate the working directory.
 
 SPARSE CHECKOUT
 ----------------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index eed9625a05..895479970d 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [list]"),
+	N_("git sparse-checkout [init|list]"),
 	NULL
 };
 
@@ -59,6 +59,71 @@ static int sparse_checkout_list(int argc, const char **argv)
 	return 0;
 }
 
+static int update_working_directory(void)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to update index with new sparse-checkout paths"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
+static int sc_enable_config(void)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+
+	if (git_config_set_gently("extensions.worktreeConfig", "true")) {
+		error(_("failed to set extensions.worktreeConfig setting"));
+		return 1;
+	}
+
+	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to enable core.sparseCheckout"));
+		return 1;
+	}
+
+	return 0;
+}
+
+static int sparse_checkout_init(int argc, const char **argv)
+{
+	struct pattern_list pl;
+	char *sparse_filename;
+	FILE *fp;
+	int res;
+
+	if (sc_enable_config())
+		return 1;
+
+	memset(&pl, 0, sizeof(pl));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
+
+	/* If we already have a sparse-checkout file, use it. */
+	if (res >= 0) {
+		free(sparse_filename);
+		goto reset_dir;
+	}
+
+	/* initial mode: all blobs at root */
+	fp = fopen(sparse_filename, "w");
+	free(sparse_filename);
+	fprintf(fp, "/*\n!/*/\n");
+	fclose(fp);
+
+reset_dir:
+	return update_working_directory();
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -79,6 +144,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 	if (argc > 0) {
 		if (!strcmp(argv[0], "list"))
 			return sparse_checkout_list(argc, argv);
+		if (!strcmp(argv[0], "init"))
+			return sparse_checkout_init(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 46e7b2dded..a6c6b336c9 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -47,5 +47,46 @@ test_expect_success 'git sparse-checkout list (populated)' '
 	test_cmp expect list
 '
 
+test_expect_success 'git sparse-checkout init' '
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=true" config &&
+	ls repo >dir  &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
+test_expect_success 'git sparse-checkout list after init' '
+	git -C repo sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect actual
+'
+
+test_expect_success 'init with existing sparse-checkout' '
+	echo "*folder*" >> repo/.git/info/sparse-checkout &&
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		*folder*
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
 
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 03/11] clone: add --sparse mode
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
  2019-09-19 14:43   ` [PATCH v2 01/11] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
  2019-09-19 14:43   ` [PATCH v2 02/11] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-10-05 19:40     ` Elijah Newren
  2019-09-19 14:43   ` [PATCH v2 05/11] sparse-checkout: add '--stdin' option to set subcommand Derrick Stolee via GitGitGadget
                     ` (9 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

When someone wants to clone a large repository, but plans to work
using a sparse-checkout file, they either need to do a full
checkout first and then reduce the patterns they included, or
clone with --no-checkout, set up their patterns, and then run
a checkout manually. This requires knowing a lot about the repo
shape and how sparse-checkout works.

Add a new '--sparse' option to 'git clone' that initializes the
sparse-checkout file to include the following patterns:

	/*
	!/*/

These patterns include every file in the root directory, but
no directories. This allows a repo to include files like a
README or a bootstrapping script to grow enlistments from that
point.

During the 'git sparse-checkout init' call, we must first look
to see if HEAD is valid, or else we will fail while trying to
update the working directory. The first checkout will actually
update the working directory correctly.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-clone.txt        |  8 +++++++-
 builtin/clone.c                    | 27 +++++++++++++++++++++++++++
 builtin/sparse-checkout.c          |  6 ++++++
 t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
index 5fc97f14de..03299a8adb 100644
--- a/Documentation/git-clone.txt
+++ b/Documentation/git-clone.txt
@@ -15,7 +15,7 @@ SYNOPSIS
 	  [--dissociate] [--separate-git-dir <git dir>]
 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
-	  [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
+	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
 	  [<directory>]
 
 DESCRIPTION
@@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
 	used, neither remote-tracking branches nor the related
 	configuration variables are created.
 
+--sparse::
+	Initialize the sparse-checkout file so the working
+	directory starts with only the files in the root
+	of the repository. The sparse-checkout file can be
+	modified to grow the working directory as needed.
+
 --mirror::
 	Set up a mirror of the source repository.  This implies `--bare`.
 	Compared to `--bare`, `--mirror` not only maps local branches of the
diff --git a/builtin/clone.c b/builtin/clone.c
index a693e6ca44..16f4e8b6fd 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -58,6 +58,7 @@ static const char *real_git_dir;
 static char *option_upload_pack = "git-upload-pack";
 static int option_verbosity;
 static int option_progress = -1;
+static int option_sparse_checkout;
 static enum transport_family family;
 static struct string_list option_config = STRING_LIST_INIT_NODUP;
 static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
@@ -145,6 +146,8 @@ static struct option builtin_clone_options[] = {
 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
 		    N_("any cloned submodules will use their remote-tracking branch")),
+	OPT_BOOL(0, "sparse", &option_sparse_checkout,
+		    N_("initialize sparse-checkout file to include only files at root")),
 	OPT_END()
 };
 
@@ -723,6 +726,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
 	}
 }
 
+static int git_sparse_checkout_init(const char *repo)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
+
+	/*
+	 * We must apply the setting in the current process
+	 * for the later checkout to use the sparse-checkout file.
+	 */
+	core_apply_sparse_checkout = 1;
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to initialize sparse-checkout"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
 static int checkout(int submodule_progress)
 {
 	struct object_id oid;
@@ -1096,6 +1120,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	if (option_required_reference.nr || option_optional_reference.nr)
 		setup_reference();
 
+	if (option_sparse_checkout && git_sparse_checkout_init(repo))
+		return 1;
+
 	remote = remote_get(option_origin);
 
 	strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 895479970d..656e6ebdd5 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -99,6 +99,7 @@ static int sparse_checkout_init(int argc, const char **argv)
 	char *sparse_filename;
 	FILE *fp;
 	int res;
+	struct object_id oid;
 
 	if (sc_enable_config())
 		return 1;
@@ -120,6 +121,11 @@ static int sparse_checkout_init(int argc, const char **argv)
 	fprintf(fp, "/*\n!/*/\n");
 	fclose(fp);
 
+	if (get_oid("HEAD", &oid)) {
+		/* assume we are in a fresh repo */
+		return 0;
+	}
+
 reset_dir:
 	return update_working_directory();
 }
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index a6c6b336c9..26b4ce9acd 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -88,5 +88,18 @@ test_expect_success 'init with existing sparse-checkout' '
 	test_cmp expect dir
 '
 
+test_expect_success 'clone --sparse' '
+	git clone --sparse repo clone &&
+	git -C clone sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect actual &&
+	ls clone >dir &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
 test_done
 
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 04/11] sparse-checkout: 'set' subcommand
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (3 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 05/11] sparse-checkout: add '--stdin' option to set subcommand Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-10-05 22:44     ` Elijah Newren
  2019-09-19 14:43   ` [PATCH v2 06/11] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
                     ` (7 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The 'git sparse-checkout set' subcommand takes a list of patterns
as arguments and writes them to the sparse-checkout file. Then, it
updates the working directory using 'git read-tree -mu HEAD'.

The 'set' subcommand will replace the entire contents of the
sparse-checkout file. The write_patterns_and_update() method is
extracted from cmd_sparse_checkout() to make it easier to implement
'add' and/or 'remove' subcommands in the future.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt |  5 +++++
 builtin/sparse-checkout.c             | 31 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 19 ++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 9707ef93b1..87813e5797 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -34,6 +34,11 @@ COMMANDS
 	by Git. Add patterns to the sparse-checkout file to
 	repopulate the working directory.
 
+'set'::
+	Write a set of patterns to the sparse-checkout file, as given as
+	a list of arguments following the 'set' subcommand. Update the
+	working directory to match the new patterns.
+
 SPARSE CHECKOUT
 ----------------
 
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 656e6ebdd5..13333fba6a 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|list]"),
+	N_("git sparse-checkout [init|list|set] <options>"),
 	NULL
 };
 
@@ -130,6 +130,33 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return update_working_directory();
 }
 
+static int write_patterns_and_update(struct pattern_list *pl)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	sparse_filename = get_sparse_checkout_filename();
+	fp = fopen(sparse_filename, "w");
+	write_patterns_to_file(fp, pl);
+	fclose(fp);
+	free(sparse_filename);
+
+	clear_pattern_list(pl);
+	return update_working_directory();
+}
+
+static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
+{
+	int i;
+	struct pattern_list pl;
+	memset(&pl, 0, sizeof(pl));
+
+	for (i = 1; i < argc; i++)
+		add_pattern(argv[i], NULL, 0, &pl, 0);
+
+	return write_patterns_and_update(&pl);
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -152,6 +179,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_list(argc, argv);
 		if (!strcmp(argv[0], "init"))
 			return sparse_checkout_init(argc, argv);
+		if (!strcmp(argv[0], "set"))
+			return sparse_checkout_set(argc, argv, prefix);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 26b4ce9acd..f21ea61494 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -101,5 +101,24 @@ test_expect_success 'clone --sparse' '
 	test_cmp expect dir
 '
 
+test_expect_success 'set sparse-checkout using builtin' '
+	git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		*folder*
+	EOF
+	git -C repo sparse-checkout list >actual &&
+	test_cmp expect actual &&
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
 
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 05/11] sparse-checkout: add '--stdin' option to set subcommand
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (2 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 03/11] clone: add --sparse mode Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-09-19 14:43   ` [PATCH v2 04/11] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
                     ` (8 subsequent siblings)
  12 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The 'git sparse-checkout set' subcommand takes a list of patterns
and places them in the sparse-checkout file. Then, it updates the
working directory to match those patterns. For a large list of
patterns, the command-line call can get very cumbersome.

Add a '--stdin' option to instead read patterns from standard input.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 35 ++++++++++++++++++++++++++++--
 t/t1091-sparse-checkout-builtin.sh | 20 +++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 13333fba6a..f726fcd6b8 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -145,14 +145,45 @@ static int write_patterns_and_update(struct pattern_list *pl)
 	return update_working_directory();
 }
 
+static char const * const builtin_sparse_checkout_set_usage[] = {
+	N_("git sparse-checkout set [--stdin|<patterns>]"),
+	NULL
+};
+
+static struct sparse_checkout_set_opts {
+	int use_stdin;
+} set_opts;
+
 static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 {
 	int i;
 	struct pattern_list pl;
+
+	static struct option builtin_sparse_checkout_set_options[] = {
+		OPT_BOOL(0, "stdin", &set_opts.use_stdin,
+			 N_("read patterns from standard in")),
+		OPT_END(),
+	};
+
 	memset(&pl, 0, sizeof(pl));
 
-	for (i = 1; i < argc; i++)
-		add_pattern(argv[i], NULL, 0, &pl, 0);
+	argc = parse_options(argc, argv, prefix,
+			     builtin_sparse_checkout_set_options,
+			     builtin_sparse_checkout_set_usage,
+			     PARSE_OPT_KEEP_UNKNOWN);
+
+	if (set_opts.use_stdin) {
+		struct strbuf line = STRBUF_INIT;
+
+		while (!strbuf_getline(&line, stdin)) {
+			size_t len;
+			char *buf = strbuf_detach(&line, &len);
+			add_pattern(buf, buf, len, &pl, 0);
+		}
+	} else {
+		for (i = 0; i < argc; i++)
+			add_pattern(argv[i], argv[i], strlen(argv[i]), &pl, 0);
+	}
 
 	return write_patterns_and_update(&pl);
 }
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index f21ea61494..02ba9ec314 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -120,5 +120,25 @@ test_expect_success 'set sparse-checkout using builtin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'set sparse-checkout using --stdin' '
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		/folder1/
+		/folder2/
+	EOF
+	git -C repo sparse-checkout set --stdin <expect &&
+	git -C repo sparse-checkout list >actual &&
+	test_cmp expect actual &&
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
 
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 06/11] sparse-checkout: create 'disable' subcommand
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (4 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 04/11] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-10-06  4:10     ` Elijah Newren
  2019-09-19 14:43   ` [PATCH v2 07/11] trace2: add region in clear_ce_flags Jeff Hostetler via GitGitGadget
                     ` (6 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The instructions for disabling sparse-checkout and restoring a
full working directory are complicated and non-intuitive. Add a
subcommand, 'git sparse-checkout disable', to perform those
steps for the user.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt | 26 ++++++++-----------
 builtin/sparse-checkout.c             | 37 ++++++++++++++++++++++++---
 t/t1091-sparse-checkout-builtin.sh    | 15 +++++++++++
 3 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 87813e5797..da95b28b1c 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -39,6 +39,10 @@ COMMANDS
 	a list of arguments following the 'set' subcommand. Update the
 	working directory to match the new patterns.
 
+'disable'::
+	Remove the sparse-checkout file, set `core.sparseCheckout` to
+	`false`, and restore the working directory to include all files.
+
 SPARSE CHECKOUT
 ----------------
 
@@ -61,6 +65,13 @@ Then it compares the new skip-worktree value with the previous one. If
 skip-worktree turns from set to unset, it will add the corresponding
 file back. If it turns from unset to set, that file will be removed.
 
+To repopulate the working directory with all files, use the
+`git sparse-checkout disable` command.
+
+Sparse checkout support in 'git checkout' and similar commands is
+disabled by default. You need to set `core.sparseCheckout` to `true`
+in order to have sparse checkout support.
+
 ## FULL PATTERN SET
 
 By default, the sparse-checkout file uses the same syntax as `.gitignore`
@@ -75,21 +86,6 @@ using negative patterns. For example, to remove the file `unwanted`:
 !unwanted
 ----------------
 
-Another tricky thing is fully repopulating the working directory when you
-no longer want sparse checkout. You cannot just disable "sparse
-checkout" because skip-worktree bits are still in the index and your working
-directory is still sparsely populated. You should re-populate the working
-directory with the `$GIT_DIR/info/sparse-checkout` file content as
-follows:
-
-----------------
-/*
-----------------
-
-Then you can disable sparse checkout. Sparse checkout support in 'git
-read-tree' and similar commands is disabled by default. You need to
-set `core.sparseCheckout` to `true` in order to have sparse checkout
-support.
 
 SEE ALSO
 --------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index f726fcd6b8..f858f0b1b5 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|list|set] <options>"),
+	N_("git sparse-checkout [init|list|set|disable] <options>"),
 	NULL
 };
 
@@ -74,7 +74,7 @@ static int update_working_directory(void)
 	return result;
 }
 
-static int sc_enable_config(void)
+static int sc_set_config(int mode)
 {
 	struct argv_array argv = ARGV_ARRAY_INIT;
 
@@ -83,7 +83,12 @@ static int sc_enable_config(void)
 		return 1;
 	}
 
-	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
+	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
+
+	if (mode)
+		argv_array_pushl(&argv, "true", NULL);
+	else
+		argv_array_pushl(&argv, "false", NULL);
 
 	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
 		error(_("failed to enable core.sparseCheckout"));
@@ -101,7 +106,7 @@ static int sparse_checkout_init(int argc, const char **argv)
 	int res;
 	struct object_id oid;
 
-	if (sc_enable_config())
+	if (sc_set_config(1))
 		return 1;
 
 	memset(&pl, 0, sizeof(pl));
@@ -188,6 +193,28 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 	return write_patterns_and_update(&pl);
 }
 
+static int sparse_checkout_disable(int argc, const char **argv)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	if (sc_set_config(1))
+		die(_("failed to change config"));
+
+	sparse_filename = get_sparse_checkout_filename();
+	fp = fopen(sparse_filename, "w");
+	fprintf(fp, "/*\n");
+	fclose(fp);
+
+	if (update_working_directory())
+		die(_("error while refreshing working directory"));
+
+	unlink(sparse_filename);
+	free(sparse_filename);
+
+	return sc_set_config(0);
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -212,6 +239,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_init(argc, argv);
 		if (!strcmp(argv[0], "set"))
 			return sparse_checkout_set(argc, argv, prefix);
+		if (!strcmp(argv[0], "disable"))
+			return sparse_checkout_disable(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 02ba9ec314..22fa032d6d 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -140,5 +140,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'sparse-checkout disable' '
+	git -C repo sparse-checkout disable &&
+	test_path_is_missing repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=false" config &&
+	ls repo >dir &&
+	cat >expect <<-EOF &&
+		a
+		deep
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
 
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 07/11] trace2: add region in clear_ce_flags
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (5 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 06/11] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Jeff Hostetler via GitGitGadget
  2019-10-06  4:13     ` Elijah Newren
  2019-09-19 14:43   ` [PATCH v2 08/11] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
                     ` (5 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Jeff Hostetler via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

When Git updates the working directory with the sparse-checkout
feature enabled, the unpack_trees() method calls clear_ce_flags()
to update the skip-worktree bits on the cache entries. This
check can be expensive, depending on the patterns used.

Add trace2 regions around the method, including some flag
information, so we can get granular performance data during
experiments. This data will be used to measure improvements
to the pattern-matching algorithms for sparse-checkout.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 unpack-trees.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/unpack-trees.c b/unpack-trees.c
index cd548f4fa2..26be8f3569 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1404,15 +1404,23 @@ static int clear_ce_flags(struct index_state *istate,
 			  struct pattern_list *pl)
 {
 	static struct strbuf prefix = STRBUF_INIT;
+	char label[100];
+	int rval;
 
 	strbuf_reset(&prefix);
 
-	return clear_ce_flags_1(istate,
+	xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)",
+		  (unsigned long)select_mask, (unsigned long)clear_mask);
+	trace2_region_enter("unpack_trees", label, the_repository);
+	rval = clear_ce_flags_1(istate,
 				istate->cache,
 				istate->cache_nr,
 				&prefix,
 				select_mask, clear_mask,
 				pl, 0);
+	trace2_region_leave("unpack_trees", label, the_repository);
+
+	return rval;
 }
 
 /*
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 09/11] sparse-checkout: use hashmaps for cone patterns
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (7 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 08/11] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-09-19 20:59     ` Derrick Stolee
  2019-09-19 14:43   ` [PATCH v2 10/11] sparse-checkout: init and set in cone mode Derrick Stolee via GitGitGadget
                     ` (3 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The parent and recursive patterns allowed by the "cone mode"
option in sparse-checkout are restrictive enough that we
can avoid using the regex parsing. Everything is based on
prefix matches, so we can use hashsets to store the prefixes
from the sparse-checkout file. When checking a path, we can
strip path entries from the path and check the hashset for
an exact match.

As a test, I created a cone-mode sparse-checkout file for the
Linux repository that actually includes every file. This was
constructed by taking every folder in the Linux repo and creating
the pattern pairs here:

	/$folder/
	!/$folder/*/

This resulted in a sparse-checkout file with 8,296 patterns.
Running 'git read-tree -mu HEAD' on this file had the following
performance:

	core.sparseCheckout=false: 0.21 s (0.00 s)
	 core.sparseCheckout=true: 3.75 s (3.50 s)
	 core.sparseCheckout=cone: 0.23 s (0.01 s)

The times in parentheses above correspond to the time spent
in the first clear_ce_flags() call, according to the trace2
performance traces.

While this example is contrived, it demonstrates how these
patterns can slow the sparse-checkout feature.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.c                              | 173 +++++++++++++++++++++++++++--
 dir.h                              |  27 +++++
 t/t1091-sparse-checkout-builtin.sh |  11 +-
 3 files changed, 202 insertions(+), 9 deletions(-)

diff --git a/dir.c b/dir.c
index 34972abdaf..4fc57187e9 100644
--- a/dir.c
+++ b/dir.c
@@ -599,6 +599,109 @@ void parse_path_pattern(const char **pattern,
 	*patternlen = len;
 }
 
+static int pl_hashmap_cmp(const void *unused_cmp_data,
+			  const void *a, const void *b, const void *key)
+{
+	const struct pattern_entry *ee1 = (const struct pattern_entry *)a;
+	const struct pattern_entry *ee2 = (const struct pattern_entry *)b;
+
+	size_t min_len = ee1->patternlen <= ee2->patternlen
+			 ? ee1->patternlen
+			 : ee2->patternlen;
+
+	return strncmp(ee1->pattern, ee2->pattern, min_len);
+}
+
+static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern *given)
+{
+	struct pattern_entry *translated;
+	char *truncated;
+	char *data = NULL;
+
+	if (!pl->use_cone_patterns)
+		return;
+
+	if (!strcmp(given->pattern, "/*"))
+		return;
+
+	if (given->patternlen > 2 &&
+	    !strcmp(given->pattern + given->patternlen - 2, "/*")) {
+		if (!(given->flags & PATTERN_FLAG_NEGATIVE)) {
+			/* Not a cone pattern. */
+			pl->use_cone_patterns = 0;
+			warning(_("unrecognized pattern: '%s'"), given->pattern);
+			goto clear_hashmaps;
+		}
+
+		truncated = xstrdup(given->pattern);
+		truncated[given->patternlen - 2] = 0;
+
+		translated = xmalloc(sizeof(struct pattern_entry));
+		translated->pattern = truncated;
+		translated->patternlen = given->patternlen - 2;
+		hashmap_entry_init(translated,
+				   memhash(translated->pattern, translated->patternlen));
+
+		if (!hashmap_get(&pl->recursive_hashmap, translated, NULL)) {
+			/* We did not see the "parent" included */
+			warning(_("unrecognized negative pattern: '%s'"),
+				given->pattern);
+			free(truncated);
+			free(translated);
+			goto clear_hashmaps;
+		}
+
+		hashmap_add(&pl->parent_hashmap, translated);
+		hashmap_remove(&pl->recursive_hashmap, translated, &data);
+		free(data);
+		return;
+	}
+
+	if (given->flags & PATTERN_FLAG_NEGATIVE) {
+		warning(_("unrecognized negative pattern: '%s'"),
+			given->pattern);
+		goto clear_hashmaps;
+	}
+
+	translated = xmalloc(sizeof(struct pattern_entry));
+
+	translated->pattern = xstrdup(given->pattern);
+	translated->patternlen = given->patternlen;
+	hashmap_entry_init(translated,
+			   memhash(translated->pattern, translated->patternlen));
+
+	hashmap_add(&pl->recursive_hashmap, translated);
+
+	if (hashmap_get(&pl->parent_hashmap, translated, NULL)) {
+		/* we already included this at the parent level */
+		warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
+			given->pattern);
+		hashmap_remove(&pl->parent_hashmap, translated, &data);
+		free(data);
+		free(translated);
+	}
+
+	return;
+
+clear_hashmaps:
+	warning(_("disabling cone pattern matching"));
+	hashmap_free(&pl->parent_hashmap, 1);
+	hashmap_free(&pl->recursive_hashmap, 1);
+	pl->use_cone_patterns = 0;
+}
+
+static int hashmap_contains_path(struct hashmap *map,
+				 struct strbuf *pattern)
+{
+	struct pattern_entry p;
+
+	/* Check straight mapping */
+	p.pattern = pattern->buf;
+	p.patternlen = pattern->len;
+	hashmap_entry_init(&p, memhash(p.pattern, p.patternlen));
+	return !!hashmap_get(map, &p, NULL);
+}
+
 void add_pattern(const char *string, const char *base,
 		 int baselen, struct pattern_list *pl, int srcpos)
 {
@@ -623,6 +726,8 @@ void add_pattern(const char *string, const char *base,
 	ALLOC_GROW(pl->patterns, pl->nr + 1, pl->alloc);
 	pl->patterns[pl->nr++] = pattern;
 	pattern->pl = pl;
+
+	add_pattern_to_hashsets(pl, pattern);
 }
 
 static int read_skip_worktree_file_from_index(const struct index_state *istate,
@@ -848,6 +953,10 @@ static int add_patterns_from_buffer(char *buf, size_t size,
 	int i, lineno = 1;
 	char *entry;
 
+	pl->use_cone_patterns = core_sparse_checkout_cone;
+	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
+	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
+
 	pl->filebuf = buf;
 
 	if (skip_utf8_bom(&buf, size))
@@ -1084,16 +1193,64 @@ enum pattern_match_result path_matches_pattern_list(
 				struct index_state *istate)
 {
 	struct path_pattern *pattern;
-	pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
-						  dtype, pl, istate);
-	if (pattern) {
-		if (pattern->flags & PATTERN_FLAG_NEGATIVE)
-			return NOT_MATCHED;
-		else
-			return MATCHED;
+	struct strbuf parent_pathname = STRBUF_INIT;
+	int result = NOT_MATCHED;
+	const char *slash_pos;
+
+	if (!pl->use_cone_patterns) {
+		pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
+							dtype, pl, istate);
+		if (pattern) {
+			if (pattern->flags & PATTERN_FLAG_NEGATIVE)
+				return NOT_MATCHED;
+			else
+				return MATCHED;
+		}
+
+		return UNDECIDED;
 	}
 
-	return UNDECIDED;
+	strbuf_addch(&parent_pathname, '/');
+	strbuf_add(&parent_pathname, pathname, pathlen);
+
+	if (hashmap_contains_path(&pl->recursive_hashmap,
+					&parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	slash_pos = strrchr(parent_pathname.buf, '/');
+
+	if (slash_pos == parent_pathname.buf) {
+		/* include every file in root */
+		result = MATCHED;
+		goto done;
+	}
+
+	strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+
+	if (hashmap_contains_path(&pl->parent_hashmap, &parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	while (parent_pathname.len) {
+		if (hashmap_contains_path(&pl->recursive_hashmap,
+					  &parent_pathname)) {
+			result = UNDECIDED;
+			goto done;
+		}
+
+		slash_pos = strrchr(parent_pathname.buf, '/');
+		if (slash_pos == parent_pathname.buf)
+			break;
+
+		strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+	}
+
+done:
+	strbuf_release(&parent_pathname);
+	return result;
 }
 
 static struct path_pattern *last_matching_pattern_from_lists(
diff --git a/dir.h b/dir.h
index 608696c958..bbd5bd1cc9 100644
--- a/dir.h
+++ b/dir.h
@@ -4,6 +4,7 @@
 /* See Documentation/technical/api-directory-listing.txt */
 
 #include "cache.h"
+#include "hashmap.h"
 #include "strbuf.h"
 
 struct dir_entry {
@@ -37,6 +38,13 @@ struct path_pattern {
 	int srcpos;
 };
 
+/* used for hashmaps for cone patterns */
+struct pattern_entry {
+	struct hashmap_entry ent;
+	char *pattern;
+	size_t patternlen;
+};
+
 /*
  * Each excludes file will be parsed into a fresh exclude_list which
  * is appended to the relevant exclude_list_group (either EXC_DIRS or
@@ -55,6 +63,25 @@ struct pattern_list {
 	const char *src;
 
 	struct path_pattern **patterns;
+
+	/*
+	 * While scanning the excludes, we attempt to match the patterns
+	 * with a more restricted set that allows us to use hashsets for
+	 * matching logic, which is faster than the linear lookup in the
+	 * excludes array above. If non-zero, that check succeeded.
+	 */
+	unsigned use_cone_patterns;
+
+	/*
+	 * Stores paths where everything starting with those paths
+	 * is included.
+	 */
+	struct hashmap recursive_hashmap;
+
+	/*
+	 * Used to check single-level parents of blobs.
+	 */
+	struct hashmap parent_hashmap;
 };
 
 /*
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 9b089c98c4..f726205d21 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -143,7 +143,8 @@ test_expect_success 'set sparse-checkout using --stdin' '
 test_expect_success 'cone mode: match patterns' '
 	git -C repo config --worktree core.sparseCheckoutCone true &&
 	rm -rf repo/a repo/folder1 repo/folder2 &&
-	git -C repo read-tree -mu HEAD &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep ! "disabling cone patterns" err &&
 	git -C repo reset --hard &&
 	ls repo >dir  &&
 	cat >expect <<-EOF &&
@@ -154,6 +155,14 @@ test_expect_success 'cone mode: match patterns' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: warn on bad pattern' '
+	test_when_finished mv sparse-checkout repo/.git/info/ &&
+	cp repo/.git/info/sparse-checkout . &&
+	echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep "unrecognized negative pattern" err
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 08/11] sparse-checkout: add 'cone' mode
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (6 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 07/11] trace2: add region in clear_ce_flags Jeff Hostetler via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-10-06  4:22     ` Elijah Newren
  2019-09-19 14:43   ` [PATCH v2 09/11] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
                     ` (4 subsequent siblings)
  12 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature can have quadratic performance as
the number of patterns and number of entries in the index grow.
If there are 1,000 patterns and 1,000,000 entries, this time can
be very significant.

Create a new Boolean config option, core.sparseCheckoutCone, to
indicate that we expect the sparse-checkout file to contain a
more limited set of patterns. This is a separate config setting
from core.sparseCheckout to avoid breaking older clients by
introducing a tri-state option.

The config option does nothing right now, but will be expanded
upon in a later commit.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/config/core.txt         |  7 ++--
 Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++
 cache.h                               |  4 ++-
 config.c                              |  5 +++
 environment.c                         |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 14 ++++++++
 6 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 75538d27e7..9b8ab2a6d4 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -591,8 +591,11 @@ core.multiPackIndex::
 	multi-pack-index design document].
 
 core.sparseCheckout::
-	Enable "sparse checkout" feature. See section "Sparse checkout" in
-	linkgit:git-read-tree[1] for more information.
+	Enable "sparse checkout" feature. If "false", then sparse-checkout
+	is disabled. If "true", then sparse-checkout is enabled with the full
+	.gitignore pattern set. If "cone", then sparse-checkout is enabled with
+	a restricted pattern set. See linkgit:git-sparse-checkout[1] for more
+	information.
 
 core.abbrev::
 	Set the length object names are abbreviated to.  If
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index da95b28b1c..757326618d 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -87,6 +87,56 @@ using negative patterns. For example, to remove the file `unwanted`:
 ----------------
 
 
+## CONE PATTERN SET
+
+The full pattern set allows for arbitrary pattern matches and complicated
+inclusion/exclusion rules. These can result in O(N*M) pattern matches when
+updating the index, where N is the number of patterns and M is the number
+of paths in the index. To combat this performance issue, a more restricted
+pattern set is allowed when `core.sparseCheckoutCone` is enabled.
+
+The accepted patterns in the cone pattern set are:
+
+1. *Recursive:* All paths inside a directory are included.
+
+2. *Parent:* All files immediately inside a directory are included.
+
+In addition to the above two patterns, we also expect that all files in the
+root directory are included. If a recursive pattern is added, then all
+leading directories are added as parent patterns.
+
+By default, when running `git sparse-checkout init`, the root directory is
+added as a parent pattern. At this point, the sparse-checkout file contains
+the following patterns:
+
+```
+/*
+!/*/
+```
+
+This says "include everything in root, but nothing two levels below root."
+If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
+`A/B` are added as parent patterns. The resulting sparse-checkout file is
+now
+
+```
+/*
+!/*/
+/A/
+!/A/*/
+/A/B/
+!/A/B/*/
+/A/B/C/
+```
+
+Here, order matters, so the negative patterns are overridden by the positive
+patterns that appear lower in the file.
+
+If `core.sparseCheckoutCone=true`, then Git will parse the sparse-checkout file
+expecting patterns of these types. Git will warn if the patterns do not match.
+If the patterns do match the expected format, then Git will use faster hash-
+based algorithms to compute inclusion in the sparse-checkout.
+
 SEE ALSO
 --------
 
diff --git a/cache.h b/cache.h
index cf5d70c196..8e8ea67efa 100644
--- a/cache.h
+++ b/cache.h
@@ -911,12 +911,14 @@ extern char *git_replace_ref_base;
 
 extern int fsync_object_files;
 extern int core_preload_index;
-extern int core_apply_sparse_checkout;
 extern int precomposed_unicode;
 extern int protect_hfs;
 extern int protect_ntfs;
 extern const char *core_fsmonitor;
 
+int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
+
 /*
  * Include broken refs in all ref iterations, which will
  * generally choke dangerous operations rather than letting
diff --git a/config.c b/config.c
index 296a6d9cc4..f65c74f5b7 100644
--- a/config.c
+++ b/config.c
@@ -1329,6 +1329,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.sparsecheckoutcone")) {
+		core_sparse_checkout_cone = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.precomposeunicode")) {
 		precomposed_unicode = git_config_bool(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 89af47cb85..670d92bcc0 100644
--- a/environment.c
+++ b/environment.c
@@ -69,6 +69,7 @@ enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
 char *notes_ref_name;
 int grafts_replace_parents = 1;
 int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
 int merge_log_config = -1;
 int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
 unsigned long pack_size_limit_cfg;
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 22fa032d6d..9b089c98c4 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -140,6 +140,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: match patterns' '
+	git -C repo config --worktree core.sparseCheckoutCone true &&
+	rm -rf repo/a repo/folder1 repo/folder2 &&
+	git -C repo read-tree -mu HEAD &&
+	git -C repo reset --hard &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 10/11] sparse-checkout: init and set in cone mode
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (8 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 09/11] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` Derrick Stolee via GitGitGadget
  2019-09-19 14:43   ` [PATCH v2 11/11] unpack-trees: hash less " Derrick Stolee via GitGitGadget
                     ` (2 subsequent siblings)
  12 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

To make the cone pattern set easy to use, update the behavior of
'git sparse-checkout [init|set]'.

Add '--cone' flag to 'git sparse-checkout init' to set the config
option 'core.sparseCheckoutCone=true'.

When running 'git sparse-checkout set' in cone mode, a user only
needs to supply a list of recursive folder matches. Git will
automatically add the necessary parent matches for the leading
directories.

When testing 'git sparse-checkout set' in cone mode, check the
error stream to ensure we do not see any errors. Specifically,
we want to avoid the warning that the patterns do not match
the cone-mode patterns.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 171 +++++++++++++++++++++++++++--
 dir.c                              |   4 +-
 dir.h                              |   3 +
 t/t1091-sparse-checkout-builtin.sh |  49 +++++++++
 4 files changed, 213 insertions(+), 14 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index f858f0b1b5..111cbc96d9 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -6,6 +6,7 @@
 #include "repository.h"
 #include "run-command.h"
 #include "strbuf.h"
+#include "string-list.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
 	N_("git sparse-checkout [init|list|set|disable] <options>"),
@@ -74,9 +75,14 @@ static int update_working_directory(void)
 	return result;
 }
 
+#define SPARSE_CHECKOUT_NONE 0
+#define SPARSE_CHECKOUT_FULL 1
+#define SPARSE_CHECKOUT_CONE 2
+
 static int sc_set_config(int mode)
 {
 	struct argv_array argv = ARGV_ARRAY_INIT;
+	struct argv_array cone_argv = ARGV_ARRAY_INIT;
 
 	if (git_config_set_gently("extensions.worktreeConfig", "true")) {
 		error(_("failed to set extensions.worktreeConfig setting"));
@@ -95,9 +101,31 @@ static int sc_set_config(int mode)
 		return 1;
 	}
 
+	argv_array_pushl(&cone_argv, "config", "--worktree",
+			 "core.sparseCheckoutCone", NULL);
+
+	if (mode == SPARSE_CHECKOUT_CONE)
+		argv_array_push(&cone_argv, "true");
+	else
+		argv_array_push(&cone_argv, "false");
+
+	if (run_command_v_opt(cone_argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to enable core.sparseCheckoutCone"));
+		return 1;
+	}
+
 	return 0;
 }
 
+static char const * const builtin_sparse_checkout_init_usage[] = {
+	N_("git sparse-checkout init [--cone]"),
+	NULL
+};
+
+static struct sparse_checkout_init_opts {
+	int cone_mode;
+} init_opts;
+
 static int sparse_checkout_init(int argc, const char **argv)
 {
 	struct pattern_list pl;
@@ -105,8 +133,21 @@ static int sparse_checkout_init(int argc, const char **argv)
 	FILE *fp;
 	int res;
 	struct object_id oid;
+	int mode;
+
+	static struct option builtin_sparse_checkout_init_options[] = {
+		OPT_BOOL(0, "cone", &init_opts.cone_mode,
+			 N_("initialize the sparse-checkout in cone mode")),
+		OPT_END(),
+	};
 
-	if (sc_set_config(1))
+	argc = parse_options(argc, argv, NULL,
+			     builtin_sparse_checkout_init_options,
+			     builtin_sparse_checkout_init_usage, 0);
+
+	mode = init_opts.cone_mode ? SPARSE_CHECKOUT_CONE : SPARSE_CHECKOUT_FULL;
+
+	if (sc_set_config(mode))
 		return 1;
 
 	memset(&pl, 0, sizeof(pl));
@@ -135,6 +176,72 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return update_working_directory();
 }
 
+static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
+{
+	struct pattern_entry *e = xmalloc(sizeof(struct pattern_entry));
+	e->patternlen = path->len;
+	e->pattern = strbuf_detach(path, NULL);
+	hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+
+	hashmap_add(&pl->recursive_hashmap, e);
+
+	while (e->patternlen) {
+		char *slash = strrchr(e->pattern, '/');
+		char *oldpattern = e->pattern;
+		size_t newlen;
+
+		if (!slash)
+			break;
+
+		newlen = slash - e->pattern;
+		e = xmalloc(sizeof(struct pattern_entry));
+		e->patternlen = newlen;
+		e->pattern = xstrndup(oldpattern, newlen);
+		hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+
+		if (!hashmap_get(&pl->parent_hashmap, e, NULL))
+			hashmap_add(&pl->parent_hashmap, e);
+	}
+}
+
+static void write_cone_to_file(FILE *fp, struct pattern_list *pl)
+{
+	int i;
+	struct pattern_entry *entry;
+	struct hashmap_iter iter;
+	struct string_list sl = STRING_LIST_INIT_DUP;
+
+	hashmap_iter_init(&pl->parent_hashmap, &iter);
+	while ((entry = hashmap_iter_next(&iter)))
+		string_list_insert(&sl, entry->pattern);
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	fprintf(fp, "/*\n!/*/\n");
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+
+		if (strlen(pattern))
+			fprintf(fp, "/%s/\n!/%s/*/\n", pattern, pattern);
+	}
+
+	string_list_clear(&sl, 0);
+
+	hashmap_iter_init(&pl->recursive_hashmap, &iter);
+	while ((entry = hashmap_iter_next(&iter)))
+		string_list_insert(&sl, entry->pattern);
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+		fprintf(fp, "/%s/\n", pattern);
+	}
+}
+
 static int write_patterns_and_update(struct pattern_list *pl)
 {
 	char *sparse_filename;
@@ -142,7 +249,12 @@ static int write_patterns_and_update(struct pattern_list *pl)
 
 	sparse_filename = get_sparse_checkout_filename();
 	fp = fopen(sparse_filename, "w");
-	write_patterns_to_file(fp, pl);
+
+	if (core_sparse_checkout_cone)
+		write_cone_to_file(fp, pl);
+	else
+		write_patterns_to_file(fp, pl);
+
 	fclose(fp);
 	free(sparse_filename);
 
@@ -150,6 +262,24 @@ static int write_patterns_and_update(struct pattern_list *pl)
 	return update_working_directory();
 }
 
+static void strbuf_to_cone_pattern(struct strbuf *line, struct pattern_list *pl)
+{
+	strbuf_trim(line);
+
+	strbuf_trim_trailing_dir_sep(line);
+
+	if (!line->len)
+		return;
+
+	if (line->buf[0] == '/')
+		strbuf_remove(line, 0, 1);
+
+	if (!line->len)
+		return;
+
+	insert_recursive_pattern(pl, line);
+}
+
 static char const * const builtin_sparse_checkout_set_usage[] = {
 	N_("git sparse-checkout set [--stdin|<patterns>]"),
 	NULL
@@ -177,17 +307,34 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 			     builtin_sparse_checkout_set_usage,
 			     PARSE_OPT_KEEP_UNKNOWN);
 
-	if (set_opts.use_stdin) {
+	if (core_sparse_checkout_cone) {
 		struct strbuf line = STRBUF_INIT;
-
-		while (!strbuf_getline(&line, stdin)) {
-			size_t len;
-			char *buf = strbuf_detach(&line, &len);
-			add_pattern(buf, buf, len, &pl, 0);
+		hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0);
+		hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0);
+
+		if (set_opts.use_stdin) {
+			while (!strbuf_getline(&line, stdin))
+				strbuf_to_cone_pattern(&line, &pl);
+		} else {
+			for (i = 0; i < argc; i++) {
+				strbuf_setlen(&line, 0);
+				strbuf_addstr(&line, argv[i]);
+				strbuf_to_cone_pattern(&line, &pl);
+			}
 		}
 	} else {
-		for (i = 0; i < argc; i++)
-			add_pattern(argv[i], argv[i], strlen(argv[i]), &pl, 0);
+		if (set_opts.use_stdin) {
+			struct strbuf line = STRBUF_INIT;
+
+			while (!strbuf_getline(&line, stdin)) {
+				size_t len;
+				char *buf = strbuf_detach(&line, &len);
+				add_pattern(buf, buf, len, &pl, 0);
+			}
+		} else {
+			for (i = 0; i < argc; i++)
+				add_pattern(argv[i], argv[i], strlen(argv[i]), &pl, 0);
+		}
 	}
 
 	return write_patterns_and_update(&pl);
@@ -198,7 +345,7 @@ static int sparse_checkout_disable(int argc, const char **argv)
 	char *sparse_filename;
 	FILE *fp;
 
-	if (sc_set_config(1))
+	if (sc_set_config(SPARSE_CHECKOUT_FULL))
 		die(_("failed to change config"));
 
 	sparse_filename = get_sparse_checkout_filename();
@@ -212,7 +359,7 @@ static int sparse_checkout_disable(int argc, const char **argv)
 	unlink(sparse_filename);
 	free(sparse_filename);
 
-	return sc_set_config(0);
+	return sc_set_config(SPARSE_CHECKOUT_NONE);
 }
 
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
diff --git a/dir.c b/dir.c
index 4fc57187e9..298a4539ec 100644
--- a/dir.c
+++ b/dir.c
@@ -599,8 +599,8 @@ void parse_path_pattern(const char **pattern,
 	*patternlen = len;
 }
 
-static int pl_hashmap_cmp(const void *unused_cmp_data,
-			  const void *a, const void *b, const void *key)
+int pl_hashmap_cmp(const void *unused_cmp_data,
+		   const void *a, const void *b, const void *key)
 {
 	const struct pattern_entry *ee1 = (const struct pattern_entry *)a;
 	const struct pattern_entry *ee2 = (const struct pattern_entry *)b;
diff --git a/dir.h b/dir.h
index bbd5bd1cc9..7c76a2d55e 100644
--- a/dir.h
+++ b/dir.h
@@ -296,6 +296,9 @@ int is_excluded(struct dir_struct *dir,
 		struct index_state *istate,
 		const char *name, int *dtype);
 
+int pl_hashmap_cmp(const void *unused_cmp_data,
+		   const void *a, const void *b, const void *key);
+
 struct pattern_list *add_pattern_list(struct dir_struct *dir,
 				      int group_type, const char *src);
 int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index f726205d21..b6eb02c69a 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -178,5 +178,54 @@ test_expect_success 'sparse-checkout disable' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: init and set' '
+	git -C repo sparse-checkout init --cone &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckoutcone=true" config &&
+	ls repo >dir  &&
+	echo a >expect &&
+	test_cmp expect dir &&
+	git -C repo sparse-checkout set deep/deeper1/deepest/ 2>err &&
+	test_line_count = 0 err &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deep
+	EOF
+	ls repo/deep >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deeper1
+	EOF
+	ls repo/deep/deeper1 >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deepest
+	EOF
+	test_cmp expect dir &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		/deep/
+		!/deep/*/
+		/deep/deeper1/
+		!/deep/deeper1/*/
+		/deep/deeper1/deepest/
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	git -C repo sparse-checkout set --stdin 2>err <<-EOF &&
+		folder1
+		folder2
+	EOF
+	test_line_count = 0 err &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	ls repo >dir &&
+	test_cmp expect dir
+'
+
 test_done
 
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v2 11/11] unpack-trees: hash less in cone mode
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (9 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 10/11] sparse-checkout: init and set in cone mode Derrick Stolee via GitGitGadget
@ 2019-09-19 14:43   ` " Derrick Stolee via GitGitGadget
  2019-10-01 13:40   ` [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode Derrick Stolee
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
  12 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-09-19 14:43 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature in "cone mode" can use the fact that
the recursive patterns are "connected" to the root via parent
patterns to decide if a directory is entirely contained in the
sparse-checkout or entirely removed.

In these cases, we can skip hashing the paths within those
directories and simply set the skipworktree bit to the correct
value.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.c          |  4 ++--
 dir.h          |  1 +
 unpack-trees.c | 38 +++++++++++++++++++++++---------------
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/dir.c b/dir.c
index 298a4539ec..35fd60d487 100644
--- a/dir.c
+++ b/dir.c
@@ -1215,7 +1215,7 @@ enum pattern_match_result path_matches_pattern_list(
 
 	if (hashmap_contains_path(&pl->recursive_hashmap,
 					&parent_pathname)) {
-		result = MATCHED;
+		result = MATCHED_RECURSIVE;
 		goto done;
 	}
 
@@ -1237,7 +1237,7 @@ enum pattern_match_result path_matches_pattern_list(
 	while (parent_pathname.len) {
 		if (hashmap_contains_path(&pl->recursive_hashmap,
 					  &parent_pathname)) {
-			result = UNDECIDED;
+			result = MATCHED_RECURSIVE;
 			goto done;
 		}
 
diff --git a/dir.h b/dir.h
index 7c76a2d55e..5f410eedbb 100644
--- a/dir.h
+++ b/dir.h
@@ -261,6 +261,7 @@ enum pattern_match_result {
 	UNDECIDED = -1,
 	NOT_MATCHED = 0,
 	MATCHED = 1,
+	MATCHED_RECURSIVE = 2,
 };
 
 /*
diff --git a/unpack-trees.c b/unpack-trees.c
index 26be8f3569..43acc0ffd6 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1280,15 +1280,17 @@ static int clear_ce_flags_dir(struct index_state *istate,
 	struct cache_entry **cache_end;
 	int dtype = DT_DIR;
 	int rc;
-	enum pattern_match_result ret;
-	ret = path_matches_pattern_list(prefix->buf, prefix->len,
-					basename, &dtype, pl, istate);
+	enum pattern_match_result ret, orig_ret;
+	orig_ret = path_matches_pattern_list(prefix->buf, prefix->len,
+					     basename, &dtype, pl, istate);
 
 	strbuf_addch(prefix, '/');
 
 	/* If undecided, use matching result of parent dir in defval */
-	if (ret == UNDECIDED)
+	if (orig_ret == UNDECIDED)
 		ret = default_match;
+	else
+		ret = orig_ret;
 
 	for (cache_end = cache; cache_end != cache + nr; cache_end++) {
 		struct cache_entry *ce = *cache_end;
@@ -1296,17 +1298,23 @@ static int clear_ce_flags_dir(struct index_state *istate,
 			break;
 	}
 
-	/*
-	 * TODO: check pl, if there are no patterns that may conflict
-	 * with ret (iow, we know in advance the incl/excl
-	 * decision for the entire directory), clear flag here without
-	 * calling clear_ce_flags_1(). That function will call
-	 * the expensive path_matches_pattern_list() on every entry.
-	 */
-	rc = clear_ce_flags_1(istate, cache, cache_end - cache,
-			      prefix,
-			      select_mask, clear_mask,
-			      pl, ret);
+	if (pl->use_cone_patterns && orig_ret == MATCHED_RECURSIVE) {
+		struct cache_entry **ce = cache;
+		rc = (cache_end - cache) / sizeof(struct cache_entry *);
+
+		while (ce < cache_end) {
+			(*ce)->ce_flags &= ~clear_mask;
+			ce++;
+		}
+	} else if (pl->use_cone_patterns && orig_ret == NOT_MATCHED) {
+		rc = (cache_end - cache) / sizeof(struct cache_entry *);
+	} else {
+		rc = clear_ce_flags_1(istate, cache, cache_end - cache,
+				      prefix,
+				      select_mask, clear_mask,
+				      pl, ret);
+	}
+
 	strbuf_setlen(prefix, prefix->len - 1);
 	return rc;
 }
-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 09/11] sparse-checkout: use hashmaps for cone patterns
  2019-09-19 14:43   ` [PATCH v2 09/11] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
@ 2019-09-19 20:59     ` Derrick Stolee
  2019-09-20 14:37       ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee @ 2019-09-19 20:59 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget, git; +Cc: Junio C Hamano, Derrick Stolee

On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
> @@ -848,6 +953,10 @@ static int add_patterns_from_buffer(char *buf, size_t size,
>  	int i, lineno = 1;
>  	char *entry;
>  
> +	pl->use_cone_patterns = core_sparse_checkout_cone;
> +	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
> +	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
> +

Just a heads-up to anyone looking at this series: this is not the
right place to set use_cone_patterns (without passing a flag or
something). This same path is called from the .gitignore machinery,
so if you have a non-cone pattern in your .gitignore you will start
seeing warnings with core.sparseCheckoutCone=true.

I figured it out only via integration tests with our C# layer. In
v3 I'll fix this and add a test to make sure it stays fixed.

Otherwise, everything is working as expected.

-Stolee


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 09/11] sparse-checkout: use hashmaps for cone patterns
  2019-09-19 20:59     ` Derrick Stolee
@ 2019-09-20 14:37       ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-09-20 14:37 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget, git; +Cc: Junio C Hamano, Derrick Stolee

On 9/19/2019 4:59 PM, Derrick Stolee wrote:
> On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
>> @@ -848,6 +953,10 @@ static int add_patterns_from_buffer(char *buf, size_t size,
>>  	int i, lineno = 1;
>>  	char *entry;
>>  
>> +	pl->use_cone_patterns = core_sparse_checkout_cone;
>> +	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
>> +	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
>> +
> 
> Just a heads-up to anyone looking at this series: this is not the
> right place to set use_cone_patterns (without passing a flag or
> something). This same path is called from the .gitignore machinery,
> so if you have a non-cone pattern in your .gitignore you will start
> seeing warnings with core.sparseCheckoutCone=true.
> 
> I figured it out only via integration tests with our C# layer. In
> v3 I'll fix this and add a test to make sure it stays fixed.

Here is the code fix. I will have a test to check this in v3.

-->8--

From 73b100d11d11bf8f045c2e116390120819dcb800 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Fri, 20 Sep 2019 08:55:06 -0400
Subject: [PATCH v2] fixup! sparse-checkout: use hashmaps for cone patterns

---
 dir.c          | 1 -
 unpack-trees.c | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/dir.c b/dir.c
index 35fd60d487..248a418379 100644
--- a/dir.c
+++ b/dir.c
@@ -953,7 +953,6 @@ static int add_patterns_from_buffer(char *buf, size_t size,
 	int i, lineno = 1;
 	char *entry;
 
-	pl->use_cone_patterns = core_sparse_checkout_cone;
 	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
 	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
 
diff --git a/unpack-trees.c b/unpack-trees.c
index 43acc0ffd6..b5cf591c38 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1487,6 +1487,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 		o->skip_sparse_checkout = 1;
 	if (!o->skip_sparse_checkout) {
 		char *sparse = git_pathdup("info/sparse-checkout");
+		pl.use_cone_patterns = core_sparse_checkout_cone;
 		if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
 			o->skip_sparse_checkout = 1;
 		else
-- 
2.23.0.vfs.1.1.19.gce6e76d



^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (10 preceding siblings ...)
  2019-09-19 14:43   ` [PATCH v2 11/11] unpack-trees: hash less " Derrick Stolee via GitGitGadget
@ 2019-10-01 13:40   ` Derrick Stolee
  2019-10-01 16:54     ` Elijah Newren
  2019-10-03 22:28     ` Junio C Hamano
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
  12 siblings, 2 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-01 13:40 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget, git; +Cc: Junio C Hamano

On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
> This series makes the sparse-checkout feature more user-friendly. While
> there, I also present a way to use a limited set of patterns to gain a
> significant performance boost in very large repositories.
> 
> Sparse-checkout is only documented as a subsection of the read-tree docs
> [1], which makes the feature hard to discover. Users have trouble navigating
> the feature, especially at clone time [2], and have even resorted to
> creating their own helper tools [3].
> 
> This series attempts to solve these problems using a new builtin.

I haven't heard anything about this series since Elijah's careful
review of the RFC. There are definitely areas where this can be
made more robust, but I'd like to save those for a follow-up series.

Junio: I know you didn't track this in the recent "what's cooking"
list, and I don't expect you to take it until I re-roll v3 to
include the .gitignore interaction I already pointed out.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode
  2019-10-01 13:40   ` [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode Derrick Stolee
@ 2019-10-01 16:54     ` Elijah Newren
  2019-10-01 18:15       ` Derrick Stolee
  2019-10-03 22:28     ` Junio C Hamano
  1 sibling, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-01 16:54 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List, Junio C Hamano

On Tue, Oct 1, 2019 at 9:48 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
> > This series makes the sparse-checkout feature more user-friendly. While
> > there, I also present a way to use a limited set of patterns to gain a
> > significant performance boost in very large repositories.
> >
> > Sparse-checkout is only documented as a subsection of the read-tree docs
> > [1], which makes the feature hard to discover. Users have trouble navigating
> > the feature, especially at clone time [2], and have even resorted to
> > creating their own helper tools [3].
> >
> > This series attempts to solve these problems using a new builtin.
>
> I haven't heard anything about this series since Elijah's careful
> review of the RFC. There are definitely areas where this can be
> made more robust, but I'd like to save those for a follow-up series.
>
> Junio: I know you didn't track this in the recent "what's cooking"
> list, and I don't expect you to take it until I re-roll v3 to
> include the .gitignore interaction I already pointed out.

Oh, sorry, I missed this.  By the way, is there any reason I wasn't
cc'ed on this round after reviewing the RFC?

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode
  2019-10-01 16:54     ` Elijah Newren
@ 2019-10-01 18:15       ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-01 18:15 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List, Junio C Hamano

On 10/1/2019 12:54 PM, Elijah Newren wrote:
> On Tue, Oct 1, 2019 at 9:48 AM Derrick Stolee <stolee@gmail.com> wrote:
>>
>> On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
>>> This series makes the sparse-checkout feature more user-friendly. While
>>> there, I also present a way to use a limited set of patterns to gain a
>>> significant performance boost in very large repositories.
>>>
>>> Sparse-checkout is only documented as a subsection of the read-tree docs
>>> [1], which makes the feature hard to discover. Users have trouble navigating
>>> the feature, especially at clone time [2], and have even resorted to
>>> creating their own helper tools [3].
>>>
>>> This series attempts to solve these problems using a new builtin.
>>
>> I haven't heard anything about this series since Elijah's careful
>> review of the RFC. There are definitely areas where this can be
>> made more robust, but I'd like to save those for a follow-up series.
>>
>> Junio: I know you didn't track this in the recent "what's cooking"
>> list, and I don't expect you to take it until I re-roll v3 to
>> include the .gitignore interaction I already pointed out.
> 
> Oh, sorry, I missed this.  By the way, is there any reason I wasn't
> cc'ed on this round after reviewing the RFC?

Sorry, I forgot to modify my GitGitGadget cover letter to include you
as a CC. Totally my oversight, not on purpose.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode
  2019-10-01 13:40   ` [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode Derrick Stolee
  2019-10-01 16:54     ` Elijah Newren
@ 2019-10-03 22:28     ` Junio C Hamano
  1 sibling, 0 replies; 196+ messages in thread
From: Junio C Hamano @ 2019-10-03 22:28 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: Derrick Stolee via GitGitGadget, git

Derrick Stolee <stolee@gmail.com> writes:

> On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
>> This series makes the sparse-checkout feature more user-friendly. While
>> there, I also present a way to use a limited set of patterns to gain a
>> significant performance boost in very large repositories.
>> 
>> Sparse-checkout is only documented as a subsection of the read-tree docs
>> [1], which makes the feature hard to discover. Users have trouble navigating
>> the feature, especially at clone time [2], and have even resorted to
>> creating their own helper tools [3].
>> 
>> This series attempts to solve these problems using a new builtin.
>
> I haven't heard anything about this series since Elijah's careful
> review of the RFC. There are definitely areas where this can be
> made more robust, but I'd like to save those for a follow-up series.
>
> Junio: I know you didn't track this in the recent "what's cooking"
> list, and I don't expect you to take it until I re-roll v3 to
> include the .gitignore interaction I already pointed out.

I have made a mental note that says "expecting v3, a
reroll. cf. <7d87fe4b-160c-34c2-db6d-4a56fd919755@gmail.com>"; there
is no existing entry to hang it below in the "what's cooking"
report, though X-<.



^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 01/11] sparse-checkout: create builtin with 'list' subcommand
  2019-09-19 14:43   ` [PATCH v2 01/11] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-05 19:22     ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-05 19:22 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Thu, Sep 19, 2019 at 1:45 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The sparse-checkout feature is mostly hidden to users, as its
> only documentation is supplementary information in the docs for
> 'git read-tree'. In addition, users need to know how to edit the
> .git/info/sparse-checkout file with the right patterns, then run
> the appropriate 'git read-tree -mu HEAD' command. Keeping the
> working directory in sync with the sparse-checkout file requires
> care.
>
> Begin an effort to make the sparse-checkout feature a porcelain
> feature by creating a new 'git sparse-checkout' builtin. This
> builtin will be the preferred mechanism for manipulating the
> sparse-checkout file and syncing the working directory.

Sounds good.

> The `$GIT_DIR/info/sparse-checkout` file defines the skip-
> worktree reference bitmap. When Git updates the working
> directory, it updates the skip-worktree bits in the index
> based on this file and removes or restores files in the
> working copy to match.

Does this paragraph make sense in the commit message?  It's not
explaining anything new or changing with your patch, just pre-existing
behavior, but you don't seem to reference or expound on it.

> The documentation provided is adapted from the "git read-tree"
> documentation with a few edits for clarity in the new context.
> Extra sections are added to hint toward a future change to
> a more restricted pattern set.

I think it needs a few more adaptations, as noted below...

> +SPARSE CHECKOUT
> +----------------
> +
> +"Sparse checkout" allows populating the working directory sparsely.
> +It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
> +Git whether a file in the working directory is worth looking at. If
> +the skip-worktree bit is set, then the file is ignored in the working
> +directory. Git will not populate the contents of those files, which
> +makes a sparse checkout helpful when working in a repository with many
> +files, but only a few are important to the current user.
> +
> +The `$GIT_DIR/info/sparse-checkout` file is used to define the
> +skip-worktree reference bitmap. When Git updates the working
> +directory, it resets the skip-worktree bit in the index based on this
> +file. If an entry
> +matches a pattern in this file, skip-worktree will not be set on
> +that entry. Otherwise, skip-worktree will be set.
> +
> +Then it compares the new skip-worktree value with the previous one. If
> +skip-worktree turns from set to unset, it will add the corresponding
> +file back. If it turns from unset to set, that file will be removed.

I know this was just copied from elsewhere, but I still have the same
problem I mentioned last time with these paragraphs: the double
negations just make it confusing to follow.  I'd prefer e.g. replacing
the last two paragraphs above with the following (which I think you
did take but accidentally placed in the commit message instead of
using it to replace these confusing paragraphs?):

The `$GIT_DIR/info/sparse-checkout` file is used to define the
skip-worktree reference bitmap. When Git updates the working
directory, it updates the skip-worktree bits in the index based on this
file and removes or restores files in the working copy to match.

It doesn't have to be this precise wording, but something like it
which is way easier to follow than those two paragraphs you were
copying.

> +Another tricky thing is fully repopulating the working directory when you
> +no longer want sparse checkout. You cannot just disable "sparse
> +checkout" because skip-worktree bits are still in the index and your working
> +directory is still sparsely populated. You should re-populate the working
> +directory with the `$GIT_DIR/info/sparse-checkout` file content as
> +follows:
> +
> +----------------
> +/*
> +----------------
> +
> +Then you can disable sparse checkout.

I would comment on this section, but it appears you remove this
section later in your series when you add 'sparse-checkout disable',
which addresses my concern.

> Sparse checkout support in 'git
> +read-tree' and similar commands is disabled by default. You need to
> +set `core.sparseCheckout` to `true` in order to have sparse checkout
> +support.

I see you change `git read-tree` to `git checkout` later in the
series, which is good.  However, you keep the second sentence which
seems unhelpful.  Why have a 'git sparse-checkout init' command if the
user still has to manually set `core.sparseCheckout`?  Also, if we're
going to mention that setting, we should mention
extensions.worktreeConfig at the same time.  Not sure whether it'd be
better to drop the second sentence or restructure it to let the user
know that it depends on the core.sparseCheckout setting which the init
command runs, but something should probably be done.


The rest of the patch looks good.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 02/11] sparse-checkout: create 'init' subcommand
  2019-09-19 14:43   ` [PATCH v2 02/11] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-05 19:34     ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-05 19:34 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Thu, Sep 19, 2019 at 3:06 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> Getting started with a sparse-checkout file can be daunting. Help
> users start their sparse enlistment using 'git sparse-checkout init'.
> This will set 'core.sparseCheckout=true' in their config, write
> an initial set of patterns to the sparse-checkout file, and update
> their working directory.

...and ensure extensions.worktreeConfig is set to true.

> Using 'git read-tree' to clear directories does not work cleanly
> on Windows, so manually delete directories that are tracked by Git
> before running read-tree.

I thought you said you fixed this?  It appears to no longer be part of
the patch, so I'm guessing you just forgot to remove this comment from
the commit message?

> The use of running another process for 'git read-tree' is likely
> suboptimal, but that can be improved in a later change, if valuable.

I think it would also be worth mentioning that not only is a
subprocess suboptimal, but the behavior of `git read-tree -mu HEAD` is
itself suboptimal for a sparse-checkout.  (We either need more error
checking e.g. when the user is in the middle of a rebase or merge or
cherry-pick and has conflicted entries with a more focused error
message for the user, or we need a command that won't abort if the
conflicts aren't in the paths we're trying to remove from or bring
back to the working tree.)


Patch looks good to me, assuming the caveats of using `git read-tree
-mu HEAD` are better documented -- and hopefully addressed at some
point.  You addressed all my other feedback on this patch from the RFC
series.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 03/11] clone: add --sparse mode
  2019-09-19 14:43   ` [PATCH v2 03/11] clone: add --sparse mode Derrick Stolee via GitGitGadget
@ 2019-10-05 19:40     ` Elijah Newren
  2019-10-07 13:56       ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-05 19:40 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Thu, Sep 19, 2019 at 3:06 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:

> During the 'git sparse-checkout init' call, we must first look
> to see if HEAD is valid, or else we will fail while trying to
> update the working directory. The first checkout will actually
> update the working directory correctly.

This is new since the RFC series, but I'm not sure I understand.  Is
the issue you're fixing here that a 'git init somerepo' would hit this
codepath and print funny errors because HEAD doesn't exist yet and
thus the whole `git read-tree -mu HEAD` stuff can't work?  Or that
when the remote has HEAD pointing at a bad commit, you get error
messages different than expected?

> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 895479970d..656e6ebdd5 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -99,6 +99,7 @@ static int sparse_checkout_init(int argc, const char **argv)
>         char *sparse_filename;
>         FILE *fp;
>         int res;
> +       struct object_id oid;
>
>         if (sc_enable_config())
>                 return 1;
> @@ -120,6 +121,11 @@ static int sparse_checkout_init(int argc, const char **argv)
>         fprintf(fp, "/*\n!/*/\n");
>         fclose(fp);
>
> +       if (get_oid("HEAD", &oid)) {
> +               /* assume we are in a fresh repo */
> +               return 0;
> +       }
> +
>  reset_dir:
>         return update_working_directory();
>  }

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 04/11] sparse-checkout: 'set' subcommand
  2019-09-19 14:43   ` [PATCH v2 04/11] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-05 22:44     ` Elijah Newren
  2019-10-06  0:30       ` Elijah Newren
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-05 22:44 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Thu, Sep 19, 2019 at 3:07 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
> +static int write_patterns_and_update(struct pattern_list *pl)
> +{
> +       char *sparse_filename;
> +       FILE *fp;
> +
> +       sparse_filename = get_sparse_checkout_filename();
> +       fp = fopen(sparse_filename, "w");
> +       write_patterns_to_file(fp, pl);
> +       fclose(fp);
> +       free(sparse_filename);
> +
> +       clear_pattern_list(pl);

It seems slightly odd that pl is passed in but cleared in this
function rather than in the caller that created pl.  Should this be
moved to the caller, or, alternatively, a comment added to explain
this side-effect for future callers of the function?

The rest of the patch looked good to me.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 04/11] sparse-checkout: 'set' subcommand
  2019-10-05 22:44     ` Elijah Newren
@ 2019-10-06  0:30       ` Elijah Newren
  2019-10-07 18:26         ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-06  0:30 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Sat, Oct 5, 2019 at 3:44 PM Elijah Newren <newren@gmail.com> wrote:
>
> On Thu, Sep 19, 2019 at 3:07 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
> > +static int write_patterns_and_update(struct pattern_list *pl)
> > +{
> > +       char *sparse_filename;
> > +       FILE *fp;
> > +
> > +       sparse_filename = get_sparse_checkout_filename();
> > +       fp = fopen(sparse_filename, "w");
> > +       write_patterns_to_file(fp, pl);
> > +       fclose(fp);
> > +       free(sparse_filename);
> > +
> > +       clear_pattern_list(pl);
>
> It seems slightly odd that pl is passed in but cleared in this
> function rather than in the caller that created pl.  Should this be
> moved to the caller, or, alternatively, a comment added to explain
> this side-effect for future callers of the function?
>
> The rest of the patch looked good to me.

Actually, thought of something else.  What if the user calls 'git
sparse-checkout set ...' without first calling 'git sparse-checkout
init'?  Should that report an error to the user, a suggestion to
follow it up with 'sparse-checkout init', or should it just call
sc_set_config() behind the scenes and allow bypassing the init
subcommand?

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 06/11] sparse-checkout: create 'disable' subcommand
  2019-09-19 14:43   ` [PATCH v2 06/11] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-06  4:10     ` Elijah Newren
  2019-10-07 19:12       ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-06  4:10 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Thu, Sep 19, 2019 at 1:46 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The instructions for disabling a sparse-checkout to a full
> working directory are complicated and non-intuitive. Add a
> subcommand, 'git sparse-checkout disable', to perform those
> steps for the user.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-sparse-checkout.txt | 26 ++++++++-----------
>  builtin/sparse-checkout.c             | 37 ++++++++++++++++++++++++---
>  t/t1091-sparse-checkout-builtin.sh    | 15 +++++++++++
>  3 files changed, 59 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index 87813e5797..da95b28b1c 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -39,6 +39,10 @@ COMMANDS
>         a list of arguments following the 'set' subcommand. Update the
>         working directory to match the new patterns.
>
> +'disable'::
> +       Remove the sparse-checkout file, set `core.sparseCheckout` to
> +       `false`, and restore the working directory to include all files.

Good, so 'init' (and maybe 'set'?) will set core.sparseCheckout, and
disable will unset it, so the user doesn't have to worry about it...

> +
>  SPARSE CHECKOUT
>  ----------------
>
> @@ -61,6 +65,13 @@ Then it compares the new skip-worktree value with the previous one. If
>  skip-worktree turns from set to unset, it will add the corresponding
>  file back. If it turns from unset to set, that file will be removed.
>
> +To repopulate the working directory with all files, use the
> +`git sparse-checkout disable` command.

Good.

> +Sparse checkout support in 'git checkout' and similar commands is
> +disabled by default. You need to set `core.sparseCheckout` to `true`
> +in order to have sparse checkout support.

Aren't we having the user use 'git sparse-checkout init' to do that?
Why guide them to the core.sparseCheckout option?  And why mention it
without extensions.worktreeConfig?

> +
>  ## FULL PATTERN SET
>
>  By default, the sparse-checkout file uses the same syntax as `.gitignore`
> @@ -75,21 +86,6 @@ using negative patterns. For example, to remove the file `unwanted`:
>  !unwanted
>  ----------------
>
> -Another tricky thing is fully repopulating the working directory when you
> -no longer want sparse checkout. You cannot just disable "sparse
> -checkout" because skip-worktree bits are still in the index and your working
> -directory is still sparsely populated. You should re-populate the working
> -directory with the `$GIT_DIR/info/sparse-checkout` file content as
> -follows:
> -
> -----------------
> -/*
> -----------------

Yaay, glad to see this removed.

> -Then you can disable sparse checkout. Sparse checkout support in 'git
> -read-tree' and similar commands is disabled by default. You need to
> -set `core.sparseCheckout` to `true` in order to have sparse checkout
> -support.
>
>  SEE ALSO
>  --------
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index f726fcd6b8..f858f0b1b5 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -8,7 +8,7 @@
>  #include "strbuf.h"
>
>  static char const * const builtin_sparse_checkout_usage[] = {
> -       N_("git sparse-checkout [init|list|set] <options>"),
> +       N_("git sparse-checkout [init|list|set|disable] <options>"),
>         NULL
>  };
>
> @@ -74,7 +74,7 @@ static int update_working_directory(void)
>         return result;
>  }
>
> -static int sc_enable_config(void)
> +static int sc_set_config(int mode)

Nice to see this change from the RFC round; do we want to use an enum
instead of an int, or is the int good enough?  (No strong opinion
here, just asking.)

>  {
>         struct argv_array argv = ARGV_ARRAY_INIT;
>
> @@ -83,7 +83,12 @@ static int sc_enable_config(void)
>                 return 1;
>         }
>
> -       argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
> +       argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
> +
> +       if (mode)
> +               argv_array_pushl(&argv, "true", NULL);
> +       else
> +               argv_array_pushl(&argv, "false", NULL);
>
>         if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>                 error(_("failed to enable core.sparseCheckout"));
> @@ -101,7 +106,7 @@ static int sparse_checkout_init(int argc, const char **argv)
>         int res;
>         struct object_id oid;
>
> -       if (sc_enable_config())
> +       if (sc_set_config(1))
>                 return 1;
>
>         memset(&pl, 0, sizeof(pl));
> @@ -188,6 +193,28 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
>         return write_patterns_and_update(&pl);
>  }
>
> +static int sparse_checkout_disable(int argc, const char **argv)
> +{
> +       char *sparse_filename;
> +       FILE *fp;
> +
> +       if (sc_set_config(1))
> +               die(_("failed to change config"));
> +
> +       sparse_filename = get_sparse_checkout_filename();
> +       fp = fopen(sparse_filename, "w");
> +       fprintf(fp, "/*\n");
> +       fclose(fp);
> +
> +       if (update_working_directory())
> +               die(_("error while refreshing working directory"));
> +
> +       unlink(sparse_filename);
> +       free(sparse_filename);
> +
> +       return sc_set_config(0);
> +}

So we update the .git/info/sparse-checkout file first (or the
worktree-specific equivalent), then call update_working_directory()
which can fail -- in particular if the user calls it when they have
any conflicted files.  But then the sparse-checkout file has already
been emptied, so it did make some changes, just not all the changes
the user would expect, leaving them in an intermediate state with an
error message that doesn't explain how to recover.  Would it be worth
checking for this case, and telling the user to fix up conflicts then
re-run the disable command?  Would it make more sense to just replace
the 'read-tree -mu HEAD' with something that doesn't error out in such
a case?  Or is this just a shortcoming of an experimental feature that
we'll get to later?  (I'm okay with the last of those, since we also
still need to address defaults of several other commands when sparse
checkouts are active[1].)

[1] https://public-inbox.org/git/CABPp-BGuFhDwWZBRaD3nA8ui46wor-4=Ha1G1oApsfF8KNpfGQ@mail.gmail.com/

> +
>  int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>  {
>         static struct option builtin_sparse_checkout_options[] = {
> @@ -212,6 +239,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>                         return sparse_checkout_init(argc, argv);
>                 if (!strcmp(argv[0], "set"))
>                         return sparse_checkout_set(argc, argv, prefix);
> +               if (!strcmp(argv[0], "disable"))
> +                       return sparse_checkout_disable(argc, argv);
>         }
>
>         usage_with_options(builtin_sparse_checkout_usage,
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 02ba9ec314..22fa032d6d 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -140,5 +140,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'sparse-checkout disable' '
> +       git -C repo sparse-checkout disable &&
> +       test_path_is_missing repo/.git/info/sparse-checkout &&
> +       git -C repo config --list >config &&
> +       test_i18ngrep "core.sparsecheckout=false" config &&
> +       ls repo >dir &&
> +       cat >expect <<-EOF &&
> +               a
> +               deep
> +               folder1
> +               folder2
> +       EOF
> +       test_cmp expect dir
> +'
> +
>  test_done

The rest of the patch looks good.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 07/11] trace2: add region in clear_ce_flags
  2019-09-19 14:43   ` [PATCH v2 07/11] trace2: add region in clear_ce_flags Jeff Hostetler via GitGitGadget
@ 2019-10-06  4:13     ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-06  4:13 UTC (permalink / raw)
  To: Jeff Hostetler via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Jeff Hostetler

On Thu, Sep 19, 2019 at 10:15 AM Jeff Hostetler via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Jeff Hostetler <jeffhost@microsoft.com>
>
> When Git updates the working directory with the sparse-checkout
> feature enabled, the unpack_trees() method calls clear_ce_flags()
> to update the skip-worktree bits on the cache entries. This
> check can be expensive, depending on the patterns used.
>
> Add trace2 regions around the method, including some flag
> information, so we can get granular performance data during
> experiments. This data will be used to measure improvements
> to the pattern-matching algorithms for sparse-checkout.
>
> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  unpack-trees.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/unpack-trees.c b/unpack-trees.c
> index cd548f4fa2..26be8f3569 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -1404,15 +1404,23 @@ static int clear_ce_flags(struct index_state *istate,
>                           struct pattern_list *pl)
>  {
>         static struct strbuf prefix = STRBUF_INIT;
> +       char label[100];
> +       int rval;
>
>         strbuf_reset(&prefix);
>
> -       return clear_ce_flags_1(istate,
> +       xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)",
> +                 (unsigned long)select_mask, (unsigned long)clear_mask);
> +       trace2_region_enter("unpack_trees", label, the_repository);
> +       rval = clear_ce_flags_1(istate,
>                                 istate->cache,
>                                 istate->cache_nr,
>                                 &prefix,
>                                 select_mask, clear_mask,
>                                 pl, 0);
> +       trace2_region_leave("unpack_trees", label, the_repository);
> +
> +       return rval;
>  }
>
>  /*
> --
> gitgitgadget

Thanks for the updates to the commit message, and the tweaks from
"exp" to "unpack_trees" in the patch.  I still don't know trace2, but
it's much clearer how this relates to the series now.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 08/11] sparse-checkout: add 'cone' mode
  2019-09-19 14:43   ` [PATCH v2 08/11] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
@ 2019-10-06  4:22     ` Elijah Newren
  2019-10-07 19:15       ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-06  4:22 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Thu, Sep 19, 2019 at 1:45 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The sparse-checkout feature can have quadratic performance as
> the number of patterns and number of entries in the index grow.
> If there are 1,000 patterns and 1,000,000 entries, this time can
> be very significant.
>
> Create a new Boolean config option, core.sparseCheckoutCone, to
> indicate that we expect the sparse-checkout file to contain a
> more limited set of patterns. This is a separate config setting
> from core.sparseCheckout to avoid breaking older clients by
> introcuding a tri-state option.

s/introcuding/introducing/

> The config option does nothing right now, but will be expanded
> upon in a later commit.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/config/core.txt         |  7 ++--
>  Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++
>  cache.h                               |  4 ++-
>  config.c                              |  5 +++
>  environment.c                         |  1 +
>  t/t1091-sparse-checkout-builtin.sh    | 14 ++++++++
>  6 files changed, 78 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index 75538d27e7..9b8ab2a6d4 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -591,8 +591,11 @@ core.multiPackIndex::
>         multi-pack-index design document].
>
>  core.sparseCheckout::
> -       Enable "sparse checkout" feature. See section "Sparse checkout" in
> -       linkgit:git-read-tree[1] for more information.
> +       Enable "sparse checkout" feature. If "false", then sparse-checkout
> +       is disabled. If "true", then sparse-checkout is enabled with the full
> +       .gitignore pattern set. If "cone", then sparse-checkout is enabled with
> +       a restricted pattern set. See linkgit:git-sparse-checkout[1] for more
> +       information.

This isn't consistent with the commit message that suggests it's a new
option rather than a new possible value for an old option.

>  core.abbrev::
>         Set the length object names are abbreviated to.  If
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index da95b28b1c..757326618d 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -87,6 +87,56 @@ using negative patterns. For example, to remove the file `unwanted`:
>  ----------------
>
>
> +## CONE PATTERN SET
> +
> +The full pattern set allows for arbitrary pattern matches and complicated
> +inclusion/exclusion rules. These can result in O(N*M) pattern matches when
> +updating the index, where N is the number of patterns and M is the number
> +of paths in the index. To combat this performance issue, a more restricted
> +pattern set is allowed when `core.sparseCheckoutCone` is enabled.
> +
> +The accepted patterns in the cone pattern set are:
> +
> +1. *Recursive:* All paths inside a directory are included.
> +
> +2. *Parent:* All files immediately inside a directory are included.
> +
> +In addition to the above two patterns, we also expect that all files in the
> +root directory are included. If a recursive pattern is added, then all
> +leading directories are added as parent patterns.
> +
> +By default, when running `git sparse-checkout init`, the root directory is
> +added as a parent pattern. At this point, the sparse-checkout file contains
> +the following patterns:
> +
> +```
> +/*
> +!/*/
> +```
> +
> +This says "include everything in root, but nothing two levels below root."
> +If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
> +`A/B` are added as parent patterns. The resulting sparse-checkout file is
> +now
> +
> +```
> +/*
> +!/*/
> +/A/
> +!/A/*/
> +/A/B/
> +!/A/B/*/
> +/A/B/C/
> +```
> +
> +Here, order matters, so the negative patterns are overridden by the positive
> +patterns that appear lower in the file.
> +
> +If `core.sparseCheckoutCone=true`, then Git will parse the sparse-checkout file
> +expecting patterns of these types. Git will warn if the patterns do not match.
> +If the patterns do match the expected format, then Git will use faster hash-
> +based algorithms to compute inclusion in the sparse-checkout.
> +
>  SEE ALSO
>  --------
>
> diff --git a/cache.h b/cache.h
> index cf5d70c196..8e8ea67efa 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -911,12 +911,14 @@ extern char *git_replace_ref_base;
>
>  extern int fsync_object_files;
>  extern int core_preload_index;
> -extern int core_apply_sparse_checkout;
>  extern int precomposed_unicode;
>  extern int protect_hfs;
>  extern int protect_ntfs;
>  extern const char *core_fsmonitor;
>
> +int core_apply_sparse_checkout;
> +int core_sparse_checkout_cone;
> +
>  /*
>   * Include broken refs in all ref iterations, which will
>   * generally choke dangerous operations rather than letting
> diff --git a/config.c b/config.c
> index 296a6d9cc4..f65c74f5b7 100644
> --- a/config.c
> +++ b/config.c
> @@ -1329,6 +1329,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>                 return 0;
>         }
>
> +       if (!strcmp(var, "core.sparsecheckoutcone")) {
> +               core_sparse_checkout_cone = git_config_bool(var, value);
> +               return 0;
> +       }
> +
>         if (!strcmp(var, "core.precomposeunicode")) {
>                 precomposed_unicode = git_config_bool(var, value);
>                 return 0;
> diff --git a/environment.c b/environment.c
> index 89af47cb85..670d92bcc0 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -69,6 +69,7 @@ enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
>  char *notes_ref_name;
>  int grafts_replace_parents = 1;
>  int core_apply_sparse_checkout;
> +int core_sparse_checkout_cone;
>  int merge_log_config = -1;
>  int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
>  unsigned long pack_size_limit_cfg;
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 22fa032d6d..9b089c98c4 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -140,6 +140,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'cone mode: match patterns' '
> +       git -C repo config --worktree core.sparseCheckoutCone true &&
> +       rm -rf repo/a repo/folder1 repo/folder2 &&
> +       git -C repo read-tree -mu HEAD &&
> +       git -C repo reset --hard &&
> +       ls repo >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               folder1
> +               folder2
> +       EOF
> +       test_cmp expect dir
> +'
> +
>  test_expect_success 'sparse-checkout disable' '
>         git -C repo sparse-checkout disable &&
>         test_path_is_missing repo/.git/info/sparse-checkout &&
> --
> gitgitgadget

What if core.sparseCheckoutCone is true but core.sparseCheckout is
false?  Is that an error case we warn the user about, or do we make
sense of it somehow?

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 03/11] clone: add --sparse mode
  2019-10-05 19:40     ` Elijah Newren
@ 2019-10-07 13:56       ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-07 13:56 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/5/2019 3:40 PM, Elijah Newren wrote:
> On Thu, Sep 19, 2019 at 3:06 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
> 
>> During the 'git sparse-checkout init' call, we must first look
>> to see if HEAD is valid, or else we will fail while trying to
>> update the working directory. The first checkout will actually
>> update the working directory correctly.
> 
> This is new since the RFC series, but I'm not sure I understand.  Is
> the issue you're fixing here that a 'git init somerepo' would hit this
> codepath and print funny errors because HEAD doesn't exist yet and
> thus the whole `git read-tree -mu HEAD` stuff can't work?  Or that
> when the remote has HEAD pointing at a bad commit that you get error
> messages different than expected?

At the point where `git clone --sparse` calls `git sparse-checkout init`,
there is no HEAD. We need to initialize the sparse-checkout before the
clone operation populates the working directory and creates the HEAD
ref. For that reason, `git read-tree -mu HEAD` wouldn't work. But that's
fine, since there is nothing to do. The index update will happen later.

> 
>> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
>> index 895479970d..656e6ebdd5 100644
>> --- a/builtin/sparse-checkout.c
>> +++ b/builtin/sparse-checkout.c
>> @@ -99,6 +99,7 @@ static int sparse_checkout_init(int argc, const char **argv)
>>         char *sparse_filename;
>>         FILE *fp;
>>         int res;
>> +       struct object_id oid;
>>
>>         if (sc_enable_config())
>>                 return 1;
>> @@ -120,6 +121,11 @@ static int sparse_checkout_init(int argc, const char **argv)
>>         fprintf(fp, "/*\n!/*/\n");
>>         fclose(fp);
>>
>> +       if (get_oid("HEAD", &oid)) {
>> +               /* assume we are in a fresh repo */
>> +               return 0;
>> +       }
>> +
>>  reset_dir:
>>         return update_working_directory();
>>  }


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 04/11] sparse-checkout: 'set' subcommand
  2019-10-06  0:30       ` Elijah Newren
@ 2019-10-07 18:26         ` Derrick Stolee
  2019-10-11 22:24           ` Elijah Newren
  0 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee @ 2019-10-07 18:26 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/5/2019 8:30 PM, Elijah Newren wrote:
> On Sat, Oct 5, 2019 at 3:44 PM Elijah Newren <newren@gmail.com> wrote:
>>
>> On Thu, Sep 19, 2019 at 3:07 PM Derrick Stolee via GitGitGadget
>> <gitgitgadget@gmail.com> wrote:
>>> +static int write_patterns_and_update(struct pattern_list *pl)
>>> +{
>>> +       char *sparse_filename;
>>> +       FILE *fp;
>>> +
>>> +       sparse_filename = get_sparse_checkout_filename();
>>> +       fp = fopen(sparse_filename, "w");
>>> +       write_patterns_to_file(fp, pl);
>>> +       fclose(fp);
>>> +       free(sparse_filename);
>>> +
>>> +       clear_pattern_list(pl);
>>
>> It seems slightly odd that pl is passed in but cleared in this
>> function rather than in the caller that created pl.  Should this be
>> moved to the caller, or, alternatively, a comment added to explain
>> this side-effect for future callers of the function?
>>
>> The rest of the patch looked good to me.
> 
> Actually, thought of something else.  What if the user calls 'git
> sparse-checkout set ...' without first calling 'git sparse-checkout
> init'?  Should that report an error to the user, a suggestion to
> follow it up with 'sparse-checkout init', or should it just call
> sc_set_config() behind the scenes and allow bypassing the init
> subcommand?

Maybe a warning would suffice. I still think the following workflow
is the most correct, and it is not difficult to recommend:

* "git sparse-checkout init [--cone]" -OR- "git clone --sparse"
* git sparse-checkout set [stuff]
* git sparse-checkout disable

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 06/11] sparse-checkout: create 'disable' subcommand
  2019-10-06  4:10     ` Elijah Newren
@ 2019-10-07 19:12       ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-07 19:12 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/6/2019 12:10 AM, Elijah Newren wrote:
> On Thu, Sep 19, 2019 at 1:46 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The instructions for disabling a sparse-checkout to a full
>> working directory are complicated and non-intuitive. Add a
>> subcommand, 'git sparse-checkout disable', to perform those
>> steps for the user.
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  Documentation/git-sparse-checkout.txt | 26 ++++++++-----------
>>  builtin/sparse-checkout.c             | 37 ++++++++++++++++++++++++---
>>  t/t1091-sparse-checkout-builtin.sh    | 15 +++++++++++
>>  3 files changed, 59 insertions(+), 19 deletions(-)
>>
>> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
>> index 87813e5797..da95b28b1c 100644
>> --- a/Documentation/git-sparse-checkout.txt
>> +++ b/Documentation/git-sparse-checkout.txt
>> @@ -39,6 +39,10 @@ COMMANDS
>>         a list of arguments following the 'set' subcommand. Update the
>>         working directory to match the new patterns.
>>
>> +'disable'::
>> +       Remove the sparse-checkout file, set `core.sparseCheckout` to
>> +       `false`, and restore the working directory to include all files.
> 
> Good, so 'init' (and maybe 'set'?) will set core.sparseCheckout, and
> disable will unset it, so the user doesn't have to worry about it...
> 
>> +
>>  SPARSE CHECKOUT
>>  ----------------
>>
>> @@ -61,6 +65,13 @@ Then it compares the new skip-worktree value with the previous one. If
>>  skip-worktree turns from set to unset, it will add the corresponding
>>  file back. If it turns from unset to set, that file will be removed.
>>
>> +To repopulate the working directory with all files, use the
>> +`git sparse-checkout disable` command.
> 
> Good.
> 
>> +Sparse checkout support in 'git checkout' and similar commands is
>> +disabled by default. You need to set `core.sparseCheckout` to `true`
>> +in order to have sparse checkout support.
> 
> Aren't we having the user use 'git sparse-checkout init' to do that?
> Why guide them to the core.sparseCheckout option?  And why mention it
> without extensions.worktreeConfig?

I'll add a paragraph above the "To repopulate..." to describe using 'init'
and 'set' instead of relying on the old phrasing.

>> +
>>  ## FULL PATTERN SET
>>
>>  By default, the sparse-checkout file uses the same syntax as `.gitignore`
>> @@ -75,21 +86,6 @@ using negative patterns. For example, to remove the file `unwanted`:
>>  !unwanted
>>  ----------------
>>
>> -Another tricky thing is fully repopulating the working directory when you
>> -no longer want sparse checkout. You cannot just disable "sparse
>> -checkout" because skip-worktree bits are still in the index and your working
>> -directory is still sparsely populated. You should re-populate the working
>> -directory with the `$GIT_DIR/info/sparse-checkout` file content as
>> -follows:
>> -
>> -----------------
>> -/*
>> -----------------
> 
> Yaay, glad to see this removed.
> 
>> -Then you can disable sparse checkout. Sparse checkout support in 'git
>> -read-tree' and similar commands is disabled by default. You need to
>> -set `core.sparseCheckout` to `true` in order to have sparse checkout
>> -support.
>>
>>  SEE ALSO
>>  --------
>> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
>> index f726fcd6b8..f858f0b1b5 100644
>> --- a/builtin/sparse-checkout.c
>> +++ b/builtin/sparse-checkout.c
>> @@ -8,7 +8,7 @@
>>  #include "strbuf.h"
>>
>>  static char const * const builtin_sparse_checkout_usage[] = {
>> -       N_("git sparse-checkout [init|list|set] <options>"),
>> +       N_("git sparse-checkout [init|list|set|disable] <options>"),
>>         NULL
>>  };
>>
>> @@ -74,7 +74,7 @@ static int update_working_directory(void)
>>         return result;
>>  }
>>
>> -static int sc_enable_config(void)
>> +static int sc_set_config(int mode)
> 
> Nice to see this change from the RFC round; do we want to use an enum
> instead of an int, or is the int good enough?  (No strong opinion
> here, just asking.)

I'll use an enum in v3.

>>  {
>>         struct argv_array argv = ARGV_ARRAY_INIT;
>>
>> @@ -83,7 +83,12 @@ static int sc_enable_config(void)
>>                 return 1;
>>         }
>>
>> -       argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
>> +       argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
>> +
>> +       if (mode)
>> +               argv_array_pushl(&argv, "true", NULL);
>> +       else
>> +               argv_array_pushl(&argv, "false", NULL);
>>
>>         if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>>                 error(_("failed to enable core.sparseCheckout"));
>> @@ -101,7 +106,7 @@ static int sparse_checkout_init(int argc, const char **argv)
>>         int res;
>>         struct object_id oid;
>>
>> -       if (sc_enable_config())
>> +       if (sc_set_config(1))
>>                 return 1;
>>
>>         memset(&pl, 0, sizeof(pl));
>> @@ -188,6 +193,28 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
>>         return write_patterns_and_update(&pl);
>>  }
>>
>> +static int sparse_checkout_disable(int argc, const char **argv)
>> +{
>> +       char *sparse_filename;
>> +       FILE *fp;
>> +
>> +       if (sc_set_config(1))
>> +               die(_("failed to change config"));
>> +
>> +       sparse_filename = get_sparse_checkout_filename();
>> +       fp = fopen(sparse_filename, "w");
>> +       fprintf(fp, "/*\n");
>> +       fclose(fp);
>> +
>> +       if (update_working_directory())
>> +               die(_("error while refreshing working directory"));
>> +
>> +       unlink(sparse_filename);
>> +       free(sparse_filename);
>> +
>> +       return sc_set_config(0);
>> +}
> 
> So we update the .git/info/sparse-checkout file first (or the
> worktree-specific equivalent), then call update_working_directory()
> which can fail -- in particular if the user calls it when they have
> any conflicted files.  But then the sparse-checkout file has already
> been emptied, so it did make some changes, just not all the changes
> the user would expect, leaving them in an intermediate state with an
> error message that doesn't explain how to recover.  Would it be worth
> checking for this case, and telling the user to fix up conflicts then
> re-run the disable command?  Would it make more sense to just replace
> the 'read-tree -mu HEAD' with something that doesn't error out in such
> a case?  Or is this just a shortcoming of an experimental feature that
> we'll get to later?  (I'm okay with the last of those, since we also
> still need to address defaults of several other commands when sparse
> checkouts are active[1].)

I think there are multiple edge cases that make the sparse-checkout
feature worthy of the "experimental" descriptor. To be explicit about
a case where update_working_directory() would fail when the sparse-checkout
file only contains "/*", the only case I can think of is when a user has
written a file outside the current sparse set but HEAD thinks that path
should be a folder (or vice-versa).

We will definitely want to make the feature more robust to these corner
cases, but that will take time. For now, let's get a framework that is
functional for 99% of cases.

And this must be said: none of these changes are permanently damaging.
If a user gets in a strange state due to these corner cases, they are
no worse off than they would be trying to follow the existing directions.

And in v3, I'll add some new commits that help these kinds of cases
during the 'set' operation by not writing to the sparse-checkout file
until the working directory update has succeeded.

> [1] https://public-inbox.org/git/CABPp-BGuFhDwWZBRaD3nA8ui46wor-4=Ha1G1oApsfF8KNpfGQ@mail.gmail.com/
> 
>> +
>>  int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>>  {
>>         static struct option builtin_sparse_checkout_options[] = {
>> @@ -212,6 +239,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>>                         return sparse_checkout_init(argc, argv);
>>                 if (!strcmp(argv[0], "set"))
>>                         return sparse_checkout_set(argc, argv, prefix);
>> +               if (!strcmp(argv[0], "disable"))
>> +                       return sparse_checkout_disable(argc, argv);
>>         }
>>
>>         usage_with_options(builtin_sparse_checkout_usage,
>> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
>> index 02ba9ec314..22fa032d6d 100755
>> --- a/t/t1091-sparse-checkout-builtin.sh
>> +++ b/t/t1091-sparse-checkout-builtin.sh
>> @@ -140,5 +140,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
>>         test_cmp expect dir
>>  '
>>
>> +test_expect_success 'sparse-checkout disable' '
>> +       git -C repo sparse-checkout disable &&
>> +       test_path_is_missing repo/.git/info/sparse-checkout &&
>> +       git -C repo config --list >config &&
>> +       test_i18ngrep "core.sparsecheckout=false" config &&
>> +       ls repo >dir &&
>> +       cat >expect <<-EOF &&
>> +               a
>> +               deep
>> +               folder1
>> +               folder2
>> +       EOF
>> +       test_cmp expect dir
>> +'
>> +
>>  test_done
> 
> The rest of the patch looks good.
> 


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 08/11] sparse-checkout: add 'cone' mode
  2019-10-06  4:22     ` Elijah Newren
@ 2019-10-07 19:15       ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-07 19:15 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/6/2019 12:22 AM, Elijah Newren wrote:
> On Thu, Sep 19, 2019 at 1:45 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The sparse-checkout feature can have quadratic performance as
>> the number of patterns and number of entries in the index grow.
>> If there are 1,000 patterns and 1,000,000 entries, this time can
>> be very significant.
>>
>> Create a new Boolean config option, core.sparseCheckoutCone, to
>> indicate that we expect the sparse-checkout file to contain a
>> more limited set of patterns. This is a separate config setting
>> from core.sparseCheckout to avoid breaking older clients by
>> introcuding a tri-state option.
> 
> s/introcuding/introducing/
> 
>> The config option does nothing right now, but will be expanded
>> upon in a later commit.
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  Documentation/config/core.txt         |  7 ++--
>>  Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++
>>  cache.h                               |  4 ++-
>>  config.c                              |  5 +++
>>  environment.c                         |  1 +
>>  t/t1091-sparse-checkout-builtin.sh    | 14 ++++++++
>>  6 files changed, 78 insertions(+), 3 deletions(-)
>>
>> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
>> index 75538d27e7..9b8ab2a6d4 100644
>> --- a/Documentation/config/core.txt
>> +++ b/Documentation/config/core.txt
>> @@ -591,8 +591,11 @@ core.multiPackIndex::
>>         multi-pack-index design document].
>>
>>  core.sparseCheckout::
>> -       Enable "sparse checkout" feature. See section "Sparse checkout" in
>> -       linkgit:git-read-tree[1] for more information.
>> +       Enable "sparse checkout" feature. If "false", then sparse-checkout
>> +       is disabled. If "true", then sparse-checkout is enabled with the full
>> +       .gitignore pattern set. If "cone", then sparse-checkout is enabled with
>> +       a restricted pattern set. See linkgit:git-sparse-checkout[1] for more
>> +       information.
> 
> This isn't consistent with the commit message that suggests it's a new
> option rather than a new possible value for an old option.

Thanks for the catch. I forgot to update the docs. Fixed for v3.

>>  core.abbrev::
>>         Set the length object names are abbreviated to.  If
>> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
>> index da95b28b1c..757326618d 100644
>> --- a/Documentation/git-sparse-checkout.txt
>> +++ b/Documentation/git-sparse-checkout.txt
>> @@ -87,6 +87,56 @@ using negative patterns. For example, to remove the file `unwanted`:
>>  ----------------
>>
>>
>> +## CONE PATTERN SET
>> +
>> +The full pattern set allows for arbitrary pattern matches and complicated
>> +inclusion/exclusion rules. These can result in O(N*M) pattern matches when
>> +updating the index, where N is the number of patterns and M is the number
>> +of paths in the index. To combat this performance issue, a more restricted
>> +pattern set is allowed when `core.sparseCheckoutCone` is enabled.
>> +
>> +The accepted patterns in the cone pattern set are:
>> +
>> +1. *Recursive:* All paths inside a directory are included.
>> +
>> +2. *Parent:* All files immediately inside a directory are included.
>> +
>> +In addition to the above two patterns, we also expect that all files in the
>> +root directory are included. If a recursive pattern is added, then all
>> +leading directories are added as parent patterns.
>> +
>> +By default, when running `git sparse-checkout init`, the root directory is
>> +added as a parent pattern. At this point, the sparse-checkout file contains
>> +the following patterns:
>> +
>> +```
>> +/*
>> +!/*/
>> +```
>> +
>> +This says "include everything in root, but nothing two levels below root."
>> +If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
>> +`A/B` are added as parent patterns. The resulting sparse-checkout file is
>> +now
>> +
>> +```
>> +/*
>> +!/*/
>> +/A/
>> +!/A/*/
>> +/A/B/
>> +!/A/B/*/
>> +/A/B/C/
>> +```
>> +
>> +Here, order matters, so the negative patterns are overridden by the positive
>> +patterns that appear lower in the file.
>> +
>> +If `core.sparseCheckoutCone=true`, then Git will parse the sparse-checkout file
>> +expecting patterns of these types. Git will warn if the patterns do not match.
>> +If the patterns do match the expected format, then Git will use faster hash-
>> +based algorithms to compute inclusion in the sparse-checkout.
>> +
>>  SEE ALSO
>>  --------
>>
>> diff --git a/cache.h b/cache.h
>> index cf5d70c196..8e8ea67efa 100644
>> --- a/cache.h
>> +++ b/cache.h
>> @@ -911,12 +911,14 @@ extern char *git_replace_ref_base;
>>
>>  extern int fsync_object_files;
>>  extern int core_preload_index;
>> -extern int core_apply_sparse_checkout;
>>  extern int precomposed_unicode;
>>  extern int protect_hfs;
>>  extern int protect_ntfs;
>>  extern const char *core_fsmonitor;
>>
>> +int core_apply_sparse_checkout;
>> +int core_sparse_checkout_cone;
>> +
>>  /*
>>   * Include broken refs in all ref iterations, which will
>>   * generally choke dangerous operations rather than letting
>> diff --git a/config.c b/config.c
>> index 296a6d9cc4..f65c74f5b7 100644
>> --- a/config.c
>> +++ b/config.c
>> @@ -1329,6 +1329,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>>                 return 0;
>>         }
>>
>> +       if (!strcmp(var, "core.sparsecheckoutcone")) {
>> +               core_sparse_checkout_cone = git_config_bool(var, value);
>> +               return 0;
>> +       }
>> +
>>         if (!strcmp(var, "core.precomposeunicode")) {
>>                 precomposed_unicode = git_config_bool(var, value);
>>                 return 0;
>> diff --git a/environment.c b/environment.c
>> index 89af47cb85..670d92bcc0 100644
>> --- a/environment.c
>> +++ b/environment.c
>> @@ -69,6 +69,7 @@ enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
>>  char *notes_ref_name;
>>  int grafts_replace_parents = 1;
>>  int core_apply_sparse_checkout;
>> +int core_sparse_checkout_cone;
>>  int merge_log_config = -1;
>>  int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
>>  unsigned long pack_size_limit_cfg;
>> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
>> index 22fa032d6d..9b089c98c4 100755
>> --- a/t/t1091-sparse-checkout-builtin.sh
>> +++ b/t/t1091-sparse-checkout-builtin.sh
>> @@ -140,6 +140,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
>>         test_cmp expect dir
>>  '
>>
>> +test_expect_success 'cone mode: match patterns' '
>> +       git -C repo config --worktree core.sparseCheckoutCone true &&
>> +       rm -rf repo/a repo/folder1 repo/folder2 &&
>> +       git -C repo read-tree -mu HEAD &&
>> +       git -C repo reset --hard &&
>> +       ls repo >dir  &&
>> +       cat >expect <<-EOF &&
>> +               a
>> +               folder1
>> +               folder2
>> +       EOF
>> +       test_cmp expect dir
>> +'
>> +
>>  test_expect_success 'sparse-checkout disable' '
>>         git -C repo sparse-checkout disable &&
>>         test_path_is_missing repo/.git/info/sparse-checkout &&
>> --
>> gitgitgadget
> 
> What if core.sparseCheckoutCone is true but core.sparseCheckout is
> false?  Is that an error case we warn the user about, or do we make
> sense of it somehow?

`core.sparseCheckoutCone` means "when you are doing sparse-checkout things,
try to use the fast logic assuming you have the cone patterns." It does not
enable the sparse-checkout on its own. It only changes run-time performance.

This is important for compatibility with other clients that will only look
at `core.sparseCheckout`.

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 00/17] New sparse-checkout builtin and "cone" mode
  2019-09-19 14:43 ` [PATCH v2 00/11] " Derrick Stolee via GitGitGadget
                     ` (11 preceding siblings ...)
  2019-10-01 13:40   ` [PATCH v2 00/11] New sparse-checkout builtin and "cone" mode Derrick Stolee
@ 2019-10-07 20:08   ` " Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 01/17] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
                       ` (18 more replies)
  12 siblings, 19 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano

This series makes the sparse-checkout feature more user-friendly. While
there, I also present a way to use a limited set of patterns to gain a
significant performance boost in very large repositories.

Sparse-checkout is only documented as a subsection of the read-tree docs
[1], which makes the feature hard to discover. Users have trouble navigating
the feature, especially at clone time [2], and have even resorted to
creating their own helper tools [3].

This series attempts to solve these problems using a new builtin. Here is a
sample workflow to give a feeling for how it can work:

In an existing repo:

$ git sparse-checkout init
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout set "/*" "!/*/" /myFolder/
$ ls
myFile1.txt myFile2.txt myFolder
$ ls myFolder
a.c a.h
$ git sparse-checkout disable
$ ls
hiddenFolder myFile1.txt myFile2.txt myFolder

At clone time:

$ git clone --sparse origin repo
$ cd repo
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout set "/*" "!/*/" /myFolder/
$ ls
myFile1.txt myFile2.txt myFolder

Here are some more specific details:

 * git sparse-checkout init enables core.sparseCheckout and populates the
   sparse-checkout file with patterns that match only the files at root.
   
   
 * git clone learns the --sparse argument to run git sparse-checkout init 
   before the first checkout.
   
   
 * git sparse-checkout set reads patterns from the arguments, or with
   --stdin reads patterns from stdin one per line, then writes them to the
   sparse-checkout file and refreshes the working directory.
   
   
 * git sparse-checkout disable removes the patterns from the sparse-checkout
   file, disables core.sparseCheckout, and refills the working directory.
   
   
 * git sparse-checkout list lists the contents of the sparse-checkout file.
   
   

The documentation for the sparse-checkout feature can now live primarily
with the git-sparse-checkout documentation.

Cone Mode
=========

What really got me interested in this area is a performance problem. If we
have N patterns in the sparse-checkout file and M entries in the index, then
we can perform up to O(N * M) pattern checks in clear_ce_flags(). This
quadratic growth is not sustainable in a repo with 1,000+ patterns and
1,000,000+ index entries.

To solve this problem, I propose a new, more restrictive mode to
sparse-checkout: "cone mode". In this mode, all patterns are based on prefix
matches at a directory level. This can then use hashsets for fast
performance -- O(M) instead of O(N*M). My hashset implementation is based on
the virtual filesystem hook in the VFS for Git custom code [4].

In cone mode, a user specifies a list of folders whose files they want
included recursively. In addition, the cone adds all blobs that are siblings
of the folders along the directory path to each such folder. This makes the directories
look "hydrated" as a user drills down to those recursively-closed folders.
These directories are called "parent" folders, as a file matches them only
if the file's immediate parent is that directory.

When building a prototype of this feature, I used a separate file to contain
the list of recursively-closed folders and built the hashsets dynamically
based on that file. In this implementation, I tried to maximize the amount
of backwards-compatibility by storing all data in the sparse-checkout file
using patterns recognized by earlier Git versions.

For example, if we add A/B/C as a recursive folder, then we add the
following patterns to the sparse-checkout file:

/*
!/*/
/A/
!/A/*/
/A/B/
!/A/B/*/
/A/B/C/

The alternating positive/negative patterns say "include everything in this
folder, but exclude everything another level deeper". The final pattern has
no matching negation, so is a recursively closed pattern.

Note that I have some basic checks that try to detect when the
sparse-checkout file doesn't match what would be written by a cone-mode add.
In such a case, Git writes a warning to stderr and continues with the old
pattern matching algorithm. These checks are currently very barebones, and
would need to be updated with more robust checks for things like regex
characters in the middle of the pattern. As review moves forward (and if we
don't change the data storage) then we could spend more time on this.

Thanks, -Stolee

Updates in v2, relative to the RFC:

 * Instead of an 'add' subcommand, use a 'set' subcommand. We can consider
   adding 'add' and/or 'remove' subcommands later.
   
   
 * 'set' reads from the arguments by default. '--stdin' option is available.
   
   
 * A new performance-oriented commit is added at the end.
   
   
 * Patterns no longer end with a trailing asterisk except for the first "/*"
   pattern.
   
   
 * References to a "bug" (that was really a strange GVFS interaction in
   microsoft/git) around deleting outside the cone are removed.
   
   

Updates in v3:

 * The bad interaction with "cone mode" and .gitignore files is fixed. A
   test is added in the last patch.
   
   
 * Several patches are added that make the feature more robust. One
   sanitizes user input, a few add progress indicators, and a few more
   prevent users from getting into bad states due to working directory
   changes or concurrent processes.
   
   
 * Updated several docs and commit messages according to feedback. Thanks,
   Elijah!
   
   

Things to leave for future patches:

 1. Integrate in 'git worktree add' to copy the sparse-checkout file to a
    worktree-specific file.
    
    
 2. More robustness around detecting non-cone patterns with wildcards in the
    middle of the line.
    
    
 3. 'git clone --sparse-cone' to clone into "cone mode" sparse-checkouts
    (i.e. set 'core.sparseCheckoutCone=true'). This may not be
    super-valuable, as it only starts changing behavior when someone calls
    'git sparse-checkout set', but may be interesting.
    
    

[1] https://git-scm.com/docs/git-read-tree#_sparse_checkout
Sparse-checkout documentation in git-read-tree.

[2] https://stackoverflow.com/a/4909267/127088
Is it possible to do a sparse checkout without checking out the whole
repository first?

[3] http://www.marcoyuen.com/articles/2016/06/07/git-sparse.html
A blog post of a user's extra "git-sparse" helper.

[4] 
https://github.com/git/git/compare/fc5fd706ff733392053e6180086a4d7f96acc2af...01204f24c5349aa2fb0c474546d768946d315dab
The virtual filesystem hook in microsoft/git.

Derrick Stolee (16):
  sparse-checkout: create builtin with 'list' subcommand
  sparse-checkout: create 'init' subcommand
  clone: add --sparse mode
  sparse-checkout: 'set' subcommand
  sparse-checkout: add '--stdin' option to set subcommand
  sparse-checkout: create 'disable' subcommand
  sparse-checkout: add 'cone' mode
  sparse-checkout: use hashmaps for cone patterns
  sparse-checkout: init and set in cone mode
  unpack-trees: hash less in cone mode
  unpack-trees: add progress to clear_ce_flags()
  read-tree: show progress by default
  sparse-checkout: sanitize for nested folders
  sparse-checkout: update working directory in-process
  sparse-checkout: write using lockfile
  sparse-checkout: cone mode should not interact with .gitignore

Jeff Hostetler (1):
  trace2: add region in clear_ce_flags

 .gitignore                            |   1 +
 Documentation/config/core.txt         |  10 +-
 Documentation/git-clone.txt           |   8 +-
 Documentation/git-read-tree.txt       |   2 +-
 Documentation/git-sparse-checkout.txt | 149 ++++++++
 Makefile                              |   1 +
 builtin.h                             |   1 +
 builtin/clone.c                       |  27 ++
 builtin/read-tree.c                   |   3 +-
 builtin/sparse-checkout.c             | 485 ++++++++++++++++++++++++++
 cache.h                               |   6 +-
 config.c                              |   5 +
 dir.c                                 | 202 ++++++++++-
 dir.h                                 |  34 ++
 environment.c                         |   1 +
 git.c                                 |   1 +
 t/t1091-sparse-checkout-builtin.sh    | 279 +++++++++++++++
 unpack-trees.c                        | 110 ++++--
 unpack-trees.h                        |   3 +-
 19 files changed, 1280 insertions(+), 48 deletions(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh


base-commit: 468ce99b77a0efaf1ace4c31a7b0a7d036fd9ca1
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-316%2Fderrickstolee%2Fsparse-checkout%2Fupstream-v3
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-316/derrickstolee/sparse-checkout/upstream-v3
Pull-Request: https://github.com/gitgitgadget/git/pull/316

Range-diff vs v2:

  1:  dbaf3de88e !  1:  30a0db68cd sparse-checkout: create builtin with 'list' subcommand
     @@ -15,12 +15,6 @@
          builtin will be the preferred mechanism for manipulating the
          sparse-checkout file and syncing the working directory.
      
     -    The `$GIT_DIR/info/sparse-checkout` file defines the skip-
     -    worktree reference bitmap. When Git updates the working
     -    directory, it updates the skip-worktree bits in the index
     -    based on this file and removes or restores files in the
     -    working copy to match.
     -
          The documentation provided is adapted from the "git read-tree"
          documentation with a few edits for clarity in the new context.
          Extra sections are added to hint toward a future change to
     @@ -101,14 +95,9 @@
      +
      +The `$GIT_DIR/info/sparse-checkout` file is used to define the
      +skip-worktree reference bitmap. When Git updates the working
     -+directory, it resets the skip-worktree bit in the index based on this
     -+file. If an entry
     -+matches a pattern in this file, skip-worktree will not be set on
     -+that entry. Otherwise, skip-worktree will be set.
     -+
     -+Then it compares the new skip-worktree value with the previous one. If
     -+skip-worktree turns from set to unset, it will add the corresponding
     -+file back. If it turns from unset to set, that file will be removed.
     ++directory, it updates the skip-worktree bits in the index based
     ++ont this file. The files matching the patterns in the file will
     ++appear in the working directory, and the rest will not.
      +
      +## FULL PATTERN SET
      +
     @@ -136,7 +125,7 @@
      +----------------
      +
      +Then you can disable sparse checkout. Sparse checkout support in 'git
     -+read-tree' and similar commands is disabled by default. You need to
     ++checkout' and similar commands is disabled by default. You need to
      +set `core.sparseCheckout` to `true` in order to have sparse checkout
      +support.
      +
     @@ -333,4 +322,3 @@
      +'
      +
      +test_done
     -+
  2:  412211f5dd !  2:  08bb6fb7f3 sparse-checkout: create 'init' subcommand
     @@ -8,12 +8,12 @@
          an initial set of patterns to the sparse-checkout file, and update
          their working directory.
      
     -    Using 'git read-tree' to clear directories does not work cleanly
     -    on Windows, so manually delete directories that are tracked by Git
     -    before running read-tree.
     +    Make sure to use the `extensions.worktreeConfig` setting and write
     +    the sparse checkout config to the worktree-specific config file.
     +    This avoids confusing interactions with other worktrees.
      
     -    The use of running another process for 'git read-tree' is likely
     -    suboptimal, but that can be improved in a later change, if valuable.
     +    The use of running another process for 'git read-tree' is sub-
     +    optimal. This will be removed in a later change.
      
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
     @@ -31,6 +31,11 @@
      +	no other directories, then will remove all directories tracked
      +	by Git. Add patterns to the sparse-checkout file to
      +	repopulate the working directory.
     +++
     ++The init subcommand also enables the 'extensions.worktreeConfig' setting
     ++and sets the `core.sparseCheckout` setting in the worktree-specific config
     ++file. This prevents the sparse-checkout feature from interfering with other
     ++worktrees.
       
       SPARSE CHECKOUT
       ----------------
     @@ -66,7 +71,12 @@
      +	return result;
      +}
      +
     -+static int sc_enable_config(void)
     ++enum sparse_checkout_mode {
     ++	MODE_NONE = 0,
     ++	MODE_FULL = 1,
     ++};
     ++
     ++static int sc_set_config(enum sparse_checkout_mode mode)
      +{
      +	struct argv_array argv = ARGV_ARRAY_INIT;
      +
     @@ -75,7 +85,12 @@
      +		return 1;
      +	}
      +
     -+	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
     ++	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
     ++
     ++	if (mode)
     ++		argv_array_pushl(&argv, "true", NULL);
     ++	else
     ++		argv_array_pushl(&argv, "false", NULL);
      +
      +	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
      +		error(_("failed to enable core.sparseCheckout"));
     @@ -92,7 +107,7 @@
      +	FILE *fp;
      +	int res;
      +
     -+	if (sc_enable_config())
     ++	if (sc_set_config(MODE_FULL))
      +		return 1;
      +
      +	memset(&pl, 0, sizeof(pl));
     @@ -178,4 +193,3 @@
      +'
      +
       test_done
     - 
  3:  fef41b794a !  3:  c8587a1fb0 clone: add --sparse mode
     @@ -21,9 +21,9 @@
          point.
      
          During the 'git sparse-checkout init' call, we must first look
     -    to see if HEAD is valid, or else we will fail while trying to
     -    update the working directory. The first checkout will actually
     -    update the working directory correctly.
     +    to see if HEAD is valid, since 'git clone' does not have a valid
     +    HEAD. The first checkout will create the HEAD ref and update the
     +    working directory correctly.
      
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
     @@ -121,7 +121,7 @@
       	int res;
      +	struct object_id oid;
       
     - 	if (sc_enable_config())
     + 	if (sc_set_config(MODE_FULL))
       		return 1;
      @@
       	fprintf(fp, "/*\n!/*/\n");
     @@ -157,4 +157,3 @@
      +'
      +
       test_done
     - 
  4:  9a78f9ea0f !  4:  6ce1d60b38 sparse-checkout: 'set' subcommand
     @@ -17,8 +17,8 @@
       --- a/Documentation/git-sparse-checkout.txt
       +++ b/Documentation/git-sparse-checkout.txt
      @@
     - 	by Git. Add patterns to the sparse-checkout file to
     - 	repopulate the working directory.
     + file. This prevents the sparse-checkout feature from interfering with other
     + worktrees.
       
      +'set'::
      +	Write a set of patterns to the sparse-checkout file, as given as
     @@ -56,20 +56,24 @@
      +	fclose(fp);
      +	free(sparse_filename);
      +
     -+	clear_pattern_list(pl);
      +	return update_working_directory();
      +}
      +
      +static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
      +{
     ++	static const char *empty_base = "";
      +	int i;
      +	struct pattern_list pl;
     ++	int result;
      +	memset(&pl, 0, sizeof(pl));
      +
      +	for (i = 1; i < argc; i++)
     -+		add_pattern(argv[i], NULL, 0, &pl, 0);
     ++		add_pattern(argv[i], empty_base, 0, &pl, 0);
      +
     -+	return write_patterns_and_update(&pl);
     ++	result = write_patterns_and_update(&pl);
     ++
     ++	clear_pattern_list(&pl);
     ++	return result;
      +}
      +
       int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
     @@ -112,4 +116,3 @@
      +'
      +
       test_done
     - 
  5:  21a0165be7 !  5:  0b1ed06bc8 sparse-checkout: add '--stdin' option to set subcommand
     @@ -14,6 +14,18 @@
       diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
       --- a/builtin/sparse-checkout.c
       +++ b/builtin/sparse-checkout.c
     +@@
     + 	char *sparse_filename;
     + 	FILE *fp;
     + 
     ++	if (!core_apply_sparse_checkout) {
     ++		warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
     ++		warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
     ++	}
     ++
     + 	sparse_filename = get_sparse_checkout_filename();
     + 	fp = fopen(sparse_filename, "w");
     + 	write_patterns_to_file(fp, pl);
      @@
       	return update_working_directory();
       }
     @@ -29,8 +41,10 @@
      +
       static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
       {
     + 	static const char *empty_base = "";
       	int i;
       	struct pattern_list pl;
     + 	int result;
      +
      +	static struct option builtin_sparse_checkout_set_options[] = {
      +		OPT_BOOL(0, "stdin", &set_opts.use_stdin,
     @@ -41,7 +55,7 @@
       	memset(&pl, 0, sizeof(pl));
       
      -	for (i = 1; i < argc; i++)
     --		add_pattern(argv[i], NULL, 0, &pl, 0);
     +-		add_pattern(argv[i], empty_base, 0, &pl, 0);
      +	argc = parse_options(argc, argv, prefix,
      +			     builtin_sparse_checkout_set_options,
      +			     builtin_sparse_checkout_set_usage,
     @@ -53,15 +67,15 @@
      +		while (!strbuf_getline(&line, stdin)) {
      +			size_t len;
      +			char *buf = strbuf_detach(&line, &len);
     -+			add_pattern(buf, buf, len, &pl, 0);
     ++			add_pattern(buf, empty_base, 0, &pl, 0);
      +		}
      +	} else {
      +		for (i = 0; i < argc; i++)
     -+			add_pattern(argv[i], argv[i], strlen(argv[i]), &pl, 0);
     ++			add_pattern(argv[i], empty_base, 0, &pl, 0);
      +	}
       
     - 	return write_patterns_and_update(&pl);
     - }
     + 	result = write_patterns_and_update(&pl);
     + 
      
       diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
       --- a/t/t1091-sparse-checkout-builtin.sh
     @@ -70,6 +84,20 @@
       	test_cmp expect dir
       '
       
     ++test_expect_success 'warn if core.sparseCheckout is disabled' '
     ++	test_when_finished git -C repo config --worktree core.sparseCheckout true &&
     ++	git -C repo config --worktree core.sparseCheckout false &&
     ++	git -C repo sparse-checkout set folder1 2>err &&
     ++	test_i18ngrep "core.sparseCheckout is disabled" err
     ++'
     ++
     + test_expect_success 'set sparse-checkout using builtin' '
     + 	git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
     + 	cat >expect <<-EOF &&
     +@@
     + 	test_cmp expect dir
     + '
     + 
      +test_expect_success 'set sparse-checkout using --stdin' '
      +	cat >expect <<-EOF &&
      +		/*
     @@ -91,4 +119,3 @@
      +'
      +
       test_done
     - 
  6:  b62b76013f !  6:  22b9bd21f4 sparse-checkout: create 'disable' subcommand
     @@ -24,15 +24,16 @@
       ----------------
       
      @@
     - skip-worktree turns from set to unset, it will add the corresponding
     - file back. If it turns from unset to set, that file will be removed.
     + ont this file. The files matching the patterns in the file will
     + appear in the working directory, and the rest will not.
       
     ++To enable the sparse-checkout feature, run `git sparse-checkout init` to
     ++initialize a simple sparse-checkout file and enable the `core.sparseCheckout`
     ++config setting. Then, run `git sparse-checkout set` to modify the patterns in
     ++the sparse-checkout file.
     ++
      +To repopulate the working directory with all files, use the
      +`git sparse-checkout disable` command.
     -+
     -+Sparse checkout support in 'git checkout' and similar commands is
     -+disabled by default. You need to set `core.sparseCheckout` to `true`
     -+in order to have sparse checkout support.
      +
       ## FULL PATTERN SET
       
     @@ -53,7 +54,7 @@
      -----------------
      -
      -Then you can disable sparse checkout. Sparse checkout support in 'git
     --read-tree' and similar commands is disabled by default. You need to
     +-checkout' and similar commands is disabled by default. You need to
      -set `core.sparseCheckout` to `true` in order to have sparse checkout
      -support.
       
     @@ -76,44 +77,12 @@
       	return result;
       }
       
     --static int sc_enable_config(void)
     -+static int sc_set_config(int mode)
     - {
     - 	struct argv_array argv = ARGV_ARRAY_INIT;
     - 
     -@@
     - 		return 1;
     - 	}
     - 
     --	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", "true", NULL);
     -+	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
     -+
     -+	if (mode)
     -+		argv_array_pushl(&argv, "true", NULL);
     -+	else
     -+		argv_array_pushl(&argv, "false", NULL);
     - 
     - 	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
     - 		error(_("failed to enable core.sparseCheckout"));
     -@@
     - 	int res;
     - 	struct object_id oid;
     - 
     --	if (sc_enable_config())
     -+	if (sc_set_config(1))
     - 		return 1;
     - 
     - 	memset(&pl, 0, sizeof(pl));
     -@@
     - 	return write_patterns_and_update(&pl);
     - }
     - 
      +static int sparse_checkout_disable(int argc, const char **argv)
      +{
      +	char *sparse_filename;
      +	FILE *fp;
      +
     -+	if (sc_set_config(1))
     ++	if (sc_set_config(MODE_FULL))
      +		die(_("failed to change config"));
      +
      +	sparse_filename = get_sparse_checkout_filename();
     @@ -127,7 +96,7 @@
      +	unlink(sparse_filename);
      +	free(sparse_filename);
      +
     -+	return sc_set_config(0);
     ++	return sc_set_config(MODE_NONE);
      +}
      +
       int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
     @@ -166,4 +135,3 @@
      +'
      +
       test_done
     - 
  7:  25642f8df2 =  7:  2c53ea13d0 trace2: add region in clear_ce_flags
  8:  84511255d1 !  8:  a66ec1affc sparse-checkout: add 'cone' mode
     @@ -11,7 +11,7 @@
          indicate that we expect the sparse-checkout file to contain a
          more limited set of patterns. This is a separate config setting
          from core.sparseCheckout to avoid breaking older clients by
     -    introcuding a tri-state option.
     +    introducing a tri-state option.
      
          The config option does nothing right now, but will be expanded
          upon in a later commit.
     @@ -27,11 +27,14 @@
       core.sparseCheckout::
      -	Enable "sparse checkout" feature. See section "Sparse checkout" in
      -	linkgit:git-read-tree[1] for more information.
     -+	Enable "sparse checkout" feature. If "false", then sparse-checkout
     -+	is disabled. If "true", then sparse-checkout is enabled with the full
     -+	.gitignore pattern set. If "cone", then sparse-checkout is enabled with
     -+	a restricted pattern set. See linkgit:git-sparse-checkout[1] for more
     -+	information.
     ++	Enable "sparse checkout" feature. See linkgit:git-sparse-checkout[1]
     ++	for more information.
     ++
     ++core.sparseCheckoutCone::
     ++	Enables the "cone mode" of the sparse checkout feature. When the
     ++	sparse-checkout file contains a limited set of patterns, then this
     ++	mode provides significant performance advantages. See
     ++	linkgit:git-sparse-checkout[1] for more information.
       
       core.abbrev::
       	Set the length object names are abbreviated to.  If
  9:  95a3285bc6 !  9:  431933bec6 sparse-checkout: use hashmaps for cone patterns
     @@ -64,8 +64,17 @@
      +	if (!pl->use_cone_patterns)
      +		return;
      +
     -+	if (!strcmp(given->pattern, "/*"))
     ++	if (given->flags & PATTERN_FLAG_NEGATIVE &&
     ++	    given->flags & PATTERN_FLAG_MUSTBEDIR &&
     ++	    !strcmp(given->pattern, "/*")) {
     ++		pl->full_cone = 0;
      +		return;
     ++	}
     ++
     ++	if (!given->flags && !strcmp(given->pattern, "/*")) {
     ++		pl->full_cone = 1;
     ++		return;
     ++	}
      +
      +	if (given->patternlen > 2 &&
      +	    !strcmp(given->pattern + given->patternlen - 2, "/*")) {
     @@ -144,6 +153,33 @@
      +	hashmap_entry_init(&p, memhash(p.pattern, p.patternlen));
      +	return !!hashmap_get(map, &p, NULL);
      +}
     ++
     ++int hashmap_contains_parent(struct hashmap *map,
     ++			    const char *path,
     ++			    struct strbuf *buffer)
     ++{
     ++	char *slash_pos;
     ++
     ++	strbuf_setlen(buffer, 0);
     ++
     ++	if (path[0] != '/')
     ++		strbuf_addch(buffer, '/');
     ++
     ++	strbuf_addstr(buffer, path);
     ++
     ++	slash_pos = strrchr(buffer->buf, '/');
     ++
     ++	while (slash_pos > buffer->buf) {
     ++		strbuf_setlen(buffer, slash_pos - buffer->buf);
     ++
     ++		if (hashmap_contains_path(map, buffer))
     ++			return 1;
     ++
     ++		slash_pos = strrchr(buffer->buf, '/');
     ++	}
     ++
     ++	return 0;
     ++}
      +
       void add_pattern(const char *string, const char *base,
       		 int baselen, struct pattern_list *pl, int srcpos)
     @@ -161,7 +197,6 @@
       	int i, lineno = 1;
       	char *entry;
       
     -+	pl->use_cone_patterns = core_sparse_checkout_cone;
      +	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
      +	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
      +
     @@ -194,14 +229,16 @@
      +		}
      +
      +		return UNDECIDED;
     - 	}
     - 
     --	return UNDECIDED;
     ++	}
     ++
     ++	if (pl->full_cone)
     ++		return MATCHED;
     ++
      +	strbuf_addch(&parent_pathname, '/');
      +	strbuf_add(&parent_pathname, pathname, pathlen);
      +
      +	if (hashmap_contains_path(&pl->recursive_hashmap,
     -+					&parent_pathname)) {
     ++				  &parent_pathname)) {
      +		result = MATCHED;
      +		goto done;
      +	}
     @@ -212,8 +249,9 @@
      +		/* include every file in root */
      +		result = MATCHED;
      +		goto done;
     -+	}
     -+
     + 	}
     + 
     +-	return UNDECIDED;
      +	strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
      +
      +	if (hashmap_contains_path(&pl->parent_hashmap, &parent_pathname)) {
     @@ -221,19 +259,10 @@
      +		goto done;
      +	}
      +
     -+	while (parent_pathname.len) {
     -+		if (hashmap_contains_path(&pl->recursive_hashmap,
     -+					  &parent_pathname)) {
     -+			result = UNDECIDED;
     -+			goto done;
     -+		}
     -+
     -+		slash_pos = strrchr(parent_pathname.buf, '/');
     -+		if (slash_pos == parent_pathname.buf)
     -+			break;
     -+
     -+		strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
     -+	}
     ++	if (hashmap_contains_parent(&pl->recursive_hashmap,
     ++				    pathname,
     ++				    &parent_pathname))
     ++		result = MATCHED;
      +
      +done:
      +	strbuf_release(&parent_pathname);
     @@ -279,6 +308,7 @@
      +	 * excludes array above. If non-zero, that check succeeded.
      +	 */
      +	unsigned use_cone_patterns;
     ++	unsigned full_cone;
      +
      +	/*
      +	 * Stores paths where everything starting with those paths
     @@ -293,6 +323,16 @@
       };
       
       /*
     +@@
     + 		struct index_state *istate,
     + 		const char *name, int *dtype);
     + 
     ++int hashmap_contains_parent(struct hashmap *map,
     ++			    const char *path,
     ++			    struct strbuf *buffer);
     + struct pattern_list *add_pattern_list(struct dir_struct *dir,
     + 				      int group_type, const char *src);
     + int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen,
      
       diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
       --- a/t/t1091-sparse-checkout-builtin.sh
     @@ -322,3 +362,15 @@
       test_expect_success 'sparse-checkout disable' '
       	git -C repo sparse-checkout disable &&
       	test_path_is_missing repo/.git/info/sparse-checkout &&
     +
     + diff --git a/unpack-trees.c b/unpack-trees.c
     + --- a/unpack-trees.c
     + +++ b/unpack-trees.c
     +@@
     + 		o->skip_sparse_checkout = 1;
     + 	if (!o->skip_sparse_checkout) {
     + 		char *sparse = git_pathdup("info/sparse-checkout");
     ++		pl.use_cone_patterns = core_sparse_checkout_cone;
     + 		if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
     + 			o->skip_sparse_checkout = 1;
     + 		else
 10:  995c5b8e2b ! 10:  69bd707e96 sparse-checkout: init and set in cone mode
     @@ -32,14 +32,13 @@
       static char const * const builtin_sparse_checkout_usage[] = {
       	N_("git sparse-checkout [init|list|set|disable] <options>"),
      @@
     - 	return result;
     - }
     + enum sparse_checkout_mode {
     + 	MODE_NONE = 0,
     + 	MODE_FULL = 1,
     ++	MODE_CONE = 2,
     + };
       
     -+#define SPARSE_CHECKOUT_NONE 0
     -+#define SPARSE_CHECKOUT_FULL 1
     -+#define SPARSE_CHECKOUT_CONE 2
     -+
     - static int sc_set_config(int mode)
     + static int sc_set_config(enum sparse_checkout_mode mode)
       {
       	struct argv_array argv = ARGV_ARRAY_INIT;
      +	struct argv_array cone_argv = ARGV_ARRAY_INIT;
     @@ -53,7 +52,7 @@
      +	argv_array_pushl(&cone_argv, "config", "--worktree",
      +			 "core.sparseCheckoutCone", NULL);
      +
     -+	if (mode == SPARSE_CHECKOUT_CONE)
     ++	if (mode == MODE_CONE)
      +		argv_array_push(&cone_argv, "true");
      +	else
      +		argv_array_push(&cone_argv, "false");
     @@ -83,19 +82,19 @@
       	int res;
       	struct object_id oid;
      +	int mode;
     -+
     + 
     +-	if (sc_set_config(MODE_FULL))
      +	static struct option builtin_sparse_checkout_init_options[] = {
      +		OPT_BOOL(0, "cone", &init_opts.cone_mode,
      +			 N_("initialize the sparse-checkout in cone mode")),
      +		OPT_END(),
      +	};
     - 
     --	if (sc_set_config(1))
     ++
      +	argc = parse_options(argc, argv, NULL,
      +			     builtin_sparse_checkout_init_options,
      +			     builtin_sparse_checkout_init_usage, 0);
      +
     -+	mode = init_opts.cone_mode ? SPARSE_CHECKOUT_CONE : SPARSE_CHECKOUT_FULL;
     ++	mode = init_opts.cone_mode ? MODE_CONE : MODE_FULL;
      +
      +	if (sc_set_config(mode))
       		return 1;
     @@ -119,7 +118,7 @@
      +		char *oldpattern = e->pattern;
      +		size_t newlen;
      +
     -+		if (!slash)
     ++		if (slash == e->pattern)
      +			break;
      +
      +		newlen = slash - e->pattern;
     @@ -153,7 +152,7 @@
      +		char *pattern = sl.items[i].string;
      +
      +		if (strlen(pattern))
     -+			fprintf(fp, "/%s/\n!/%s/*/\n", pattern, pattern);
     ++			fprintf(fp, "%s/\n!%s/*/\n", pattern, pattern);
      +	}
      +
      +	string_list_clear(&sl, 0);
     @@ -167,7 +166,7 @@
      +
      +	for (i = 0; i < sl.nr; i++) {
      +		char *pattern = sl.items[i].string;
     -+		fprintf(fp, "/%s/\n", pattern);
     ++		fprintf(fp, "%s/\n", pattern);
      +	}
      +}
      +
     @@ -188,7 +187,6 @@
       	fclose(fp);
       	free(sparse_filename);
       
     -@@
       	return update_working_directory();
       }
       
     @@ -201,11 +199,8 @@
      +	if (!line->len)
      +		return;
      +
     -+	if (line->buf[0] == '/')
     -+		strbuf_remove(line, 0, 1);
     -+
     -+	if (!line->len)
     -+		return;
     ++	if (line->buf[0] != '/')
     ++		strbuf_insert(line, 0, "/", 1);
      +
      +	insert_recursive_pattern(pl, line);
      +}
     @@ -224,7 +219,7 @@
      -		while (!strbuf_getline(&line, stdin)) {
      -			size_t len;
      -			char *buf = strbuf_detach(&line, &len);
     --			add_pattern(buf, buf, len, &pl, 0);
     +-			add_pattern(buf, empty_base, 0, &pl, 0);
      +		hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0);
      +		hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0);
      +
     @@ -240,40 +235,22 @@
       		}
       	} else {
      -		for (i = 0; i < argc; i++)
     --			add_pattern(argv[i], argv[i], strlen(argv[i]), &pl, 0);
     +-			add_pattern(argv[i], empty_base, 0, &pl, 0);
      +		if (set_opts.use_stdin) {
      +			struct strbuf line = STRBUF_INIT;
      +
      +			while (!strbuf_getline(&line, stdin)) {
      +				size_t len;
      +				char *buf = strbuf_detach(&line, &len);
     -+				add_pattern(buf, buf, len, &pl, 0);
     ++				add_pattern(buf, empty_base, 0, &pl, 0);
      +			}
      +		} else {
      +			for (i = 0; i < argc; i++)
     -+				add_pattern(argv[i], argv[i], strlen(argv[i]), &pl, 0);
     ++				add_pattern(argv[i], empty_base, 0, &pl, 0);
      +		}
       	}
       
     - 	return write_patterns_and_update(&pl);
     -@@
     - 	char *sparse_filename;
     - 	FILE *fp;
     - 
     --	if (sc_set_config(1))
     -+	if (sc_set_config(SPARSE_CHECKOUT_FULL))
     - 		die(_("failed to change config"));
     - 
     - 	sparse_filename = get_sparse_checkout_filename();
     -@@
     - 	unlink(sparse_filename);
     - 	free(sparse_filename);
     - 
     --	return sc_set_config(0);
     -+	return sc_set_config(SPARSE_CHECKOUT_NONE);
     - }
     - 
     - int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
     + 	result = write_patterns_and_update(&pl);
      
       diff --git a/dir.c b/dir.c
       --- a/dir.c
     @@ -299,10 +276,9 @@
       
      +int pl_hashmap_cmp(const void *unused_cmp_data,
      +		   const void *a, const void *b, const void *key);
     -+
     - struct pattern_list *add_pattern_list(struct dir_struct *dir,
     - 				      int group_type, const char *src);
     - int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen,
     + int hashmap_contains_parent(struct hashmap *map,
     + 			    const char *path,
     + 			    struct strbuf *buffer);
      
       diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
       --- a/t/t1091-sparse-checkout-builtin.sh
     @@ -361,4 +337,3 @@
      +'
      +
       test_done
     - 
 11:  1d4321488e ! 11:  e06349fcec unpack-trees: hash less in cone mode
     @@ -19,21 +19,21 @@
      @@
       
       	if (hashmap_contains_path(&pl->recursive_hashmap,
     - 					&parent_pathname)) {
     + 				  &parent_pathname)) {
      -		result = MATCHED;
      +		result = MATCHED_RECURSIVE;
       		goto done;
       	}
       
      @@
     - 	while (parent_pathname.len) {
     - 		if (hashmap_contains_path(&pl->recursive_hashmap,
     - 					  &parent_pathname)) {
     --			result = UNDECIDED;
     -+			result = MATCHED_RECURSIVE;
     - 			goto done;
     - 		}
     + 	if (hashmap_contains_parent(&pl->recursive_hashmap,
     + 				    pathname,
     + 				    &parent_pathname))
     +-		result = MATCHED;
     ++		result = MATCHED_RECURSIVE;
       
     + done:
     + 	strbuf_release(&parent_pathname);
      
       diff --git a/dir.h b/dir.h
       --- a/dir.h
  -:  ---------- > 12:  3ef32084f5 unpack-trees: add progress to clear_ce_flags()
  -:  ---------- > 13:  3a677f32b6 read-tree: show progress by default
  -:  ---------- > 14:  56444a5498 sparse-checkout: sanitize for nested folders
  -:  ---------- > 15:  a6f17e9a77 sparse-checkout: update working directory in-process
  -:  ---------- > 16:  8927494b8c sparse-checkout: write using lockfile
  -:  ---------- > 17:  7f377c1407 sparse-checkout: cone mode should not interact with .gitignore

-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 01/17] sparse-checkout: create builtin with 'list' subcommand
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-11 22:01       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 02/17] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
                       ` (17 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature is mostly hidden to users, as its
only documentation is supplementary information in the docs for
'git read-tree'. In addition, users need to know how to edit the
.git/info/sparse-checkout file with the right patterns, then run
the appropriate 'git read-tree -mu HEAD' command. Keeping the
working directory in sync with the sparse-checkout file requires
care.

Begin an effort to make the sparse-checkout feature a porcelain
feature by creating a new 'git sparse-checkout' builtin. This
builtin will be the preferred mechanism for manipulating the
sparse-checkout file and syncing the working directory.

The documentation provided is adapted from the "git read-tree"
documentation with a few edits for clarity in the new context.
Extra sections are added to hint toward a future change to
a more restricted pattern set.

Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 .gitignore                            |  1 +
 Documentation/git-read-tree.txt       |  2 +-
 Documentation/git-sparse-checkout.txt | 85 ++++++++++++++++++++++++++
 Makefile                              |  1 +
 builtin.h                             |  1 +
 builtin/sparse-checkout.c             | 86 +++++++++++++++++++++++++++
 git.c                                 |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 50 ++++++++++++++++
 8 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh

diff --git a/.gitignore b/.gitignore
index 4470d7cfc0..5ccc3d00dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -156,6 +156,7 @@
 /git-show-branch
 /git-show-index
 /git-show-ref
+/git-sparse-checkout
 /git-stage
 /git-stash
 /git-status
diff --git a/Documentation/git-read-tree.txt b/Documentation/git-read-tree.txt
index d271842608..da33f84f33 100644
--- a/Documentation/git-read-tree.txt
+++ b/Documentation/git-read-tree.txt
@@ -436,7 +436,7 @@ support.
 SEE ALSO
 --------
 linkgit:git-write-tree[1]; linkgit:git-ls-files[1];
-linkgit:gitignore[5]
+linkgit:gitignore[5]; linkgit:git-sparse-checkout[1];
 
 GIT
 ---
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
new file mode 100644
index 0000000000..81676b1d33
--- /dev/null
+++ b/Documentation/git-sparse-checkout.txt
@@ -0,0 +1,85 @@
+git-sparse-checkout(1)
+=======================
+
+NAME
+----
+git-sparse-checkout - Initialize and modify the sparse-checkout
+configuration, which reduces the checkout to a set of directories
+given by a list of prefixes.
+
+
+SYNOPSIS
+--------
+[verse]
+'git sparse-checkout <subcommand> [options]'
+
+
+DESCRIPTION
+-----------
+
+Initialize and modify the sparse-checkout configuration, which reduces
+the checkout to a set of directories given by a list of prefixes.
+
+
+COMMANDS
+--------
+'list'::
+	Provide a list of the contents in the sparse-checkout file.
+
+
+SPARSE CHECKOUT
+----------------
+
+"Sparse checkout" allows populating the working directory sparsely.
+It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
+Git whether a file in the working directory is worth looking at. If
+the skip-worktree bit is set, then the file is ignored in the working
+directory. Git will not populate the contents of those files, which
+makes a sparse checkout helpful when working in a repository with many
+files, but only a few are important to the current user.
+
+The `$GIT_DIR/info/sparse-checkout` file is used to define the
+skip-worktree reference bitmap. When Git updates the working
+directory, it updates the skip-worktree bits in the index based
+on this file. The files matching the patterns in the file will
+appear in the working directory, and the rest will not.
+
+## FULL PATTERN SET
+
+By default, the sparse-checkout file uses the same syntax as `.gitignore`
+files.
+
+While `$GIT_DIR/info/sparse-checkout` is usually used to specify what
+files are included, you can also specify what files are _not_ included,
+using negative patterns. For example, to remove the file `unwanted`:
+
+----------------
+/*
+!unwanted
+----------------
+
+Another tricky thing is fully repopulating the working directory when you
+no longer want sparse checkout. You cannot just disable "sparse
+checkout" because skip-worktree bits are still in the index and your working
+directory is still sparsely populated. You should re-populate the working
+directory with the `$GIT_DIR/info/sparse-checkout` file content as
+follows:
+
+----------------
+/*
+----------------
+
+Then you can disable sparse checkout. Sparse checkout support in 'git
+checkout' and similar commands is disabled by default. You need to
+set `core.sparseCheckout` to `true` in order to have sparse checkout
+support.
+
+SEE ALSO
+--------
+
+linkgit:git-read-tree[1]
+linkgit:gitignore[5]
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index f58bf14c7b..f3322b75dd 100644
--- a/Makefile
+++ b/Makefile
@@ -1121,6 +1121,7 @@ BUILTIN_OBJS += builtin/shortlog.o
 BUILTIN_OBJS += builtin/show-branch.o
 BUILTIN_OBJS += builtin/show-index.o
 BUILTIN_OBJS += builtin/show-ref.o
+BUILTIN_OBJS += builtin/sparse-checkout.o
 BUILTIN_OBJS += builtin/stash.o
 BUILTIN_OBJS += builtin/stripspace.o
 BUILTIN_OBJS += builtin/submodule--helper.o
diff --git a/builtin.h b/builtin.h
index ec7e0954c4..d517068faa 100644
--- a/builtin.h
+++ b/builtin.h
@@ -223,6 +223,7 @@ int cmd_shortlog(int argc, const char **argv, const char *prefix);
 int cmd_show(int argc, const char **argv, const char *prefix);
 int cmd_show_branch(int argc, const char **argv, const char *prefix);
 int cmd_show_index(int argc, const char **argv, const char *prefix);
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix);
 int cmd_status(int argc, const char **argv, const char *prefix);
 int cmd_stash(int argc, const char **argv, const char *prefix);
 int cmd_stripspace(int argc, const char **argv, const char *prefix);
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
new file mode 100644
index 0000000000..eed9625a05
--- /dev/null
+++ b/builtin/sparse-checkout.c
@@ -0,0 +1,86 @@
+#include "builtin.h"
+#include "config.h"
+#include "dir.h"
+#include "parse-options.h"
+#include "pathspec.h"
+#include "repository.h"
+#include "run-command.h"
+#include "strbuf.h"
+
+static char const * const builtin_sparse_checkout_usage[] = {
+	N_("git sparse-checkout [list]"),
+	NULL
+};
+
+static char *get_sparse_checkout_filename(void)
+{
+	return git_pathdup("info/sparse-checkout");
+}
+
+static void write_patterns_to_file(FILE *fp, struct pattern_list *pl)
+{
+	int i;
+
+	for (i = 0; i < pl->nr; i++) {
+		struct path_pattern *p = pl->patterns[i];
+
+		if (p->flags & PATTERN_FLAG_NEGATIVE)
+			fprintf(fp, "!");
+
+		fprintf(fp, "%s", p->pattern);
+
+		if (p->flags & PATTERN_FLAG_MUSTBEDIR)
+			fprintf(fp, "/");
+
+		fprintf(fp, "\n");
+	}
+}
+
+static int sparse_checkout_list(int argc, const char **argv)
+{
+	struct pattern_list pl;
+	char *sparse_filename;
+	int res;
+
+	memset(&pl, 0, sizeof(pl));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
+	free(sparse_filename);
+
+	if (res < 0) {
+		warning(_("this worktree is not sparse (sparse-checkout file may not exist)"));
+		return 0;
+	}
+
+	write_patterns_to_file(stdout, &pl);
+	clear_pattern_list(&pl);
+
+	return 0;
+}
+
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
+{
+	static struct option builtin_sparse_checkout_options[] = {
+		OPT_END(),
+	};
+
+	if (argc == 2 && !strcmp(argv[1], "-h"))
+		usage_with_options(builtin_sparse_checkout_usage,
+				   builtin_sparse_checkout_options);
+
+	argc = parse_options(argc, argv, prefix,
+			     builtin_sparse_checkout_options,
+			     builtin_sparse_checkout_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+
+	git_config(git_default_config, NULL);
+
+	if (argc > 0) {
+		if (!strcmp(argv[0], "list"))
+			return sparse_checkout_list(argc, argv);
+	}
+
+	usage_with_options(builtin_sparse_checkout_usage,
+			   builtin_sparse_checkout_options);
+}
diff --git a/git.c b/git.c
index c2eec470c9..e775fbad42 100644
--- a/git.c
+++ b/git.c
@@ -576,6 +576,7 @@ static struct cmd_struct commands[] = {
 	{ "show-branch", cmd_show_branch, RUN_SETUP },
 	{ "show-index", cmd_show_index },
 	{ "show-ref", cmd_show_ref, RUN_SETUP },
+	{ "sparse-checkout", cmd_sparse_checkout, RUN_SETUP | NEED_WORK_TREE },
 	{ "stage", cmd_add, RUN_SETUP | NEED_WORK_TREE },
 	/*
 	 * NEEDSWORK: Until the builtin stash is thoroughly robust and no
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
new file mode 100755
index 0000000000..a9b04b1a88
--- /dev/null
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+test_description='sparse checkout builtin tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	git init repo &&
+	(
+		cd repo &&
+		echo "initial" >a &&
+		mkdir folder1 folder2 deep &&
+		mkdir deep/deeper1 deep/deeper2 &&
+		mkdir deep/deeper1/deepest &&
+		cp a folder1 &&
+		cp a folder2 &&
+		cp a deep &&
+		cp a deep/deeper1 &&
+		cp a deep/deeper2 &&
+		cp a deep/deeper1/deepest &&
+		git add . &&
+		git commit -m "initial commit"
+	)
+'
+
+test_expect_success 'git sparse-checkout list (empty)' '
+	git -C repo sparse-checkout list >list 2>err &&
+	test_line_count = 0 list &&
+	test_i18ngrep "this worktree is not sparse (sparse-checkout file may not exist)" err
+'
+
+test_expect_success 'git sparse-checkout list (populated)' '
+	test_when_finished rm -f repo/.git/info/sparse-checkout &&
+	cat >repo/.git/info/sparse-checkout <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	git -C repo sparse-checkout list >list &&
+	cat >expect <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	test_cmp expect list
+'
+
+test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 02/17] sparse-checkout: create 'init' subcommand
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 01/17] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-11 22:14       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 03/17] clone: add --sparse mode Derrick Stolee via GitGitGadget
                       ` (16 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

Getting started with a sparse-checkout file can be daunting. Help
users start their sparse enlistment using 'git sparse-checkout init'.
This will set 'core.sparseCheckout=true' in their config, write
an initial set of patterns to the sparse-checkout file, and update
their working directory.

Make sure to use the `extensions.worktreeConfig` setting and write
the sparse checkout config to the worktree-specific config file.
This avoids confusing interactions with other worktrees.

Running a separate process for 'git read-tree' is suboptimal; this
extra process will be removed in a later change.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt | 12 ++++
 builtin/sparse-checkout.c             | 79 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 41 ++++++++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 81676b1d33..e095c4a98b 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -26,6 +26,18 @@ COMMANDS
 'list'::
 	Provide a list of the contents in the sparse-checkout file.
 
+'init'::
+	Enable the `core.sparseCheckout` setting. If the
+	sparse-checkout file does not exist, then populate it with
+	patterns that match every file in the root directory and
+	no other directories, then remove all directories tracked
+	by Git. Add patterns to the sparse-checkout file to
+	repopulate the working directory.
++
+The init subcommand also enables the 'extensions.worktreeConfig' setting
+and sets the `core.sparseCheckout` setting in the worktree-specific config
+file. This prevents the sparse-checkout feature from interfering with other
+worktrees.
 
 SPARSE CHECKOUT
 ----------------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index eed9625a05..64b2bb2b8c 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [list]"),
+	N_("git sparse-checkout [init|list]"),
 	NULL
 };
 
@@ -59,6 +59,81 @@ static int sparse_checkout_list(int argc, const char **argv)
 	return 0;
 }
 
+static int update_working_directory(void)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to update index with new sparse-checkout paths"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
+enum sparse_checkout_mode {
+	MODE_NONE = 0,
+	MODE_FULL = 1,
+};
+
+static int sc_set_config(enum sparse_checkout_mode mode)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+
+	if (git_config_set_gently("extensions.worktreeConfig", "true")) {
+		error(_("failed to set extensions.worktreeConfig setting"));
+		return 1;
+	}
+
+	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
+
+	if (mode)
+		argv_array_pushl(&argv, "true", NULL);
+	else
+		argv_array_pushl(&argv, "false", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to enable core.sparseCheckout"));
+		return 1;
+	}
+
+	return 0;
+}
+
+static int sparse_checkout_init(int argc, const char **argv)
+{
+	struct pattern_list pl;
+	char *sparse_filename;
+	FILE *fp;
+	int res;
+
+	if (sc_set_config(MODE_FULL))
+		return 1;
+
+	memset(&pl, 0, sizeof(pl));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
+
+	/* If we already have a sparse-checkout file, use it. */
+	if (res >= 0) {
+		free(sparse_filename);
+		goto reset_dir;
+	}
+
+	/* initial mode: all blobs at root */
+	fp = fopen(sparse_filename, "w");
+	free(sparse_filename);
+	fprintf(fp, "/*\n!/*/\n");
+	fclose(fp);
+
+reset_dir:
+	return update_working_directory();
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -79,6 +154,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 	if (argc > 0) {
 		if (!strcmp(argv[0], "list"))
 			return sparse_checkout_list(argc, argv);
+		if (!strcmp(argv[0], "init"))
+			return sparse_checkout_init(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index a9b04b1a88..c70085a759 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -47,4 +47,45 @@ test_expect_success 'git sparse-checkout list (populated)' '
 	test_cmp expect list
 '
 
+test_expect_success 'git sparse-checkout init' '
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=true" config &&
+	ls repo >dir  &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
+test_expect_success 'git sparse-checkout list after init' '
+	git -C repo sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect actual
+'
+
+test_expect_success 'init with existing sparse-checkout' '
+	echo "*folder*" >> repo/.git/info/sparse-checkout &&
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		*folder*
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 03/17] clone: add --sparse mode
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 01/17] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 02/17] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-11 22:20       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 04/17] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
                       ` (15 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

When someone wants to clone a large repository, but plans to work
using a sparse-checkout file, they either need to do a full
checkout first and then reduce the patterns they included, or
clone with --no-checkout, set up their patterns, and then run
a checkout manually. This requires knowing a lot about the repo
shape and how sparse-checkout works.

Add a new '--sparse' option to 'git clone' that initializes the
sparse-checkout file to include the following patterns:

	/*
	!/*/

These patterns include every file in the root directory, but
no directories. This allows a repo to include files like a
README or a bootstrapping script to grow enlistments from that
point.

During the 'git sparse-checkout init' call, we must first look
to see if HEAD is valid, since 'git clone' does not have a valid
HEAD. The first checkout will create the HEAD ref and update the
working directory correctly.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-clone.txt        |  8 +++++++-
 builtin/clone.c                    | 27 +++++++++++++++++++++++++++
 builtin/sparse-checkout.c          |  6 ++++++
 t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
index 5fc97f14de..03299a8adb 100644
--- a/Documentation/git-clone.txt
+++ b/Documentation/git-clone.txt
@@ -15,7 +15,7 @@ SYNOPSIS
 	  [--dissociate] [--separate-git-dir <git dir>]
 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
-	  [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
+	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
 	  [<directory>]
 
 DESCRIPTION
@@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
 	used, neither remote-tracking branches nor the related
 	configuration variables are created.
 
+--sparse::
+	Initialize the sparse-checkout file so the working
+	directory starts with only the files in the root
+	of the repository. The sparse-checkout file can be
+	modified to grow the working directory as needed.
+
 --mirror::
 	Set up a mirror of the source repository.  This implies `--bare`.
 	Compared to `--bare`, `--mirror` not only maps local branches of the
diff --git a/builtin/clone.c b/builtin/clone.c
index a693e6ca44..16f4e8b6fd 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -58,6 +58,7 @@ static const char *real_git_dir;
 static char *option_upload_pack = "git-upload-pack";
 static int option_verbosity;
 static int option_progress = -1;
+static int option_sparse_checkout;
 static enum transport_family family;
 static struct string_list option_config = STRING_LIST_INIT_NODUP;
 static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
@@ -145,6 +146,8 @@ static struct option builtin_clone_options[] = {
 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
 		    N_("any cloned submodules will use their remote-tracking branch")),
+	OPT_BOOL(0, "sparse", &option_sparse_checkout,
+		    N_("initialize sparse-checkout file to include only files at root")),
 	OPT_END()
 };
 
@@ -723,6 +726,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
 	}
 }
 
+static int git_sparse_checkout_init(const char *repo)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
+
+	/*
+	 * We must apply the setting in the current process
+	 * for the later checkout to use the sparse-checkout file.
+	 */
+	core_apply_sparse_checkout = 1;
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to initialize sparse-checkout"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
 static int checkout(int submodule_progress)
 {
 	struct object_id oid;
@@ -1096,6 +1120,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	if (option_required_reference.nr || option_optional_reference.nr)
 		setup_reference();
 
+	if (option_sparse_checkout && git_sparse_checkout_init(repo))
+		return 1;
+
 	remote = remote_get(option_origin);
 
 	strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 64b2bb2b8c..3ecb7ac2e7 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -109,6 +109,7 @@ static int sparse_checkout_init(int argc, const char **argv)
 	char *sparse_filename;
 	FILE *fp;
 	int res;
+	struct object_id oid;
 
 	if (sc_set_config(MODE_FULL))
 		return 1;
@@ -130,6 +131,11 @@ static int sparse_checkout_init(int argc, const char **argv)
 	fprintf(fp, "/*\n!/*/\n");
 	fclose(fp);
 
+	if (get_oid("HEAD", &oid)) {
+		/* assume we are in a fresh repo */
+		return 0;
+	}
+
 reset_dir:
 	return update_working_directory();
 }
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index c70085a759..d4c145a3af 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -88,4 +88,17 @@ test_expect_success 'init with existing sparse-checkout' '
 	test_cmp expect dir
 '
 
+test_expect_success 'clone --sparse' '
+	git clone --sparse repo clone &&
+	git -C clone sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect actual &&
+	ls clone >dir &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 04/17] sparse-checkout: 'set' subcommand
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (2 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 03/17] clone: add --sparse mode Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-11 22:26       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 05/17] sparse-checkout: add '--stdin' option to set subcommand Derrick Stolee via GitGitGadget
                       ` (14 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The 'git sparse-checkout set' subcommand takes a list of patterns
as arguments and writes them to the sparse-checkout file. Then, it
updates the working directory using 'git read-tree -mu HEAD'.

The 'set' subcommand will replace the entire contents of the
sparse-checkout file. The write_patterns_and_update() method is
extracted from cmd_sparse_checkout() to make it easier to implement
'add' and/or 'remove' subcommands in the future.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt |  5 ++++
 builtin/sparse-checkout.c             | 35 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 19 +++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index e095c4a98b..f4bd951550 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -39,6 +39,11 @@ and sets the `core.sparseCheckout` setting in the worktree-specific config
 file. This prevents the sparse-checkout feature from interfering with other
 worktrees.
 
+'set'::
+	Write a set of patterns to the sparse-checkout file, as given as
+	a list of arguments following the 'set' subcommand. Update the
+	working directory to match the new patterns.
+
 SPARSE CHECKOUT
 ----------------
 
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 3ecb7ac2e7..52d4f832f3 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|list]"),
+	N_("git sparse-checkout [init|list|set] <options>"),
 	NULL
 };
 
@@ -140,6 +140,37 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return update_working_directory();
 }
 
+static int write_patterns_and_update(struct pattern_list *pl)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	sparse_filename = get_sparse_checkout_filename();
+	fp = fopen(sparse_filename, "w");
+	write_patterns_to_file(fp, pl);
+	fclose(fp);
+	free(sparse_filename);
+
+	return update_working_directory();
+}
+
+static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
+{
+	static const char *empty_base = "";
+	int i;
+	struct pattern_list pl;
+	int result;
+	memset(&pl, 0, sizeof(pl));
+
+	for (i = 1; i < argc; i++)
+		add_pattern(argv[i], empty_base, 0, &pl, 0);
+
+	result = write_patterns_and_update(&pl);
+
+	clear_pattern_list(&pl);
+	return result;
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -162,6 +193,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_list(argc, argv);
 		if (!strcmp(argv[0], "init"))
 			return sparse_checkout_init(argc, argv);
+		if (!strcmp(argv[0], "set"))
+			return sparse_checkout_set(argc, argv, prefix);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index d4c145a3af..19e8673c6b 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -101,4 +101,23 @@ test_expect_success 'clone --sparse' '
 	test_cmp expect dir
 '
 
+test_expect_success 'set sparse-checkout using builtin' '
+	git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		*folder*
+	EOF
+	git -C repo sparse-checkout list >actual &&
+	test_cmp expect actual &&
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 05/17] sparse-checkout: add '--stdin' option to set subcommand
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (3 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 04/17] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-11 22:27       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 06/17] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
                       ` (13 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The 'git sparse-checkout set' subcommand takes a list of patterns
and places them in the sparse-checkout file. Then, it updates the
working directory to match those patterns. For a large list of
patterns, the command-line call can get very cumbersome.

Add a '--stdin' option to instead read patterns from standard input.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 40 ++++++++++++++++++++++++++++--
 t/t1091-sparse-checkout-builtin.sh | 27 ++++++++++++++++++++
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 52d4f832f3..68f3d8433e 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -145,6 +145,11 @@ static int write_patterns_and_update(struct pattern_list *pl)
 	char *sparse_filename;
 	FILE *fp;
 
+	if (!core_apply_sparse_checkout) {
+		warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
+		warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
+	}
+
 	sparse_filename = get_sparse_checkout_filename();
 	fp = fopen(sparse_filename, "w");
 	write_patterns_to_file(fp, pl);
@@ -154,16 +159,47 @@ static int write_patterns_and_update(struct pattern_list *pl)
 	return update_working_directory();
 }
 
+static char const * const builtin_sparse_checkout_set_usage[] = {
+	N_("git sparse-checkout set [--stdin|<patterns>]"),
+	NULL
+};
+
+static struct sparse_checkout_set_opts {
+	int use_stdin;
+} set_opts;
+
 static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 {
 	static const char *empty_base = "";
 	int i;
 	struct pattern_list pl;
 	int result;
+
+	static struct option builtin_sparse_checkout_set_options[] = {
+		OPT_BOOL(0, "stdin", &set_opts.use_stdin,
+			 N_("read patterns from standard in")),
+		OPT_END(),
+	};
+
 	memset(&pl, 0, sizeof(pl));
 
-	for (i = 1; i < argc; i++)
-		add_pattern(argv[i], empty_base, 0, &pl, 0);
+	argc = parse_options(argc, argv, prefix,
+			     builtin_sparse_checkout_set_options,
+			     builtin_sparse_checkout_set_usage,
+			     PARSE_OPT_KEEP_UNKNOWN);
+
+	if (set_opts.use_stdin) {
+		struct strbuf line = STRBUF_INIT;
+
+		while (!strbuf_getline(&line, stdin)) {
+			size_t len;
+			char *buf = strbuf_detach(&line, &len);
+			add_pattern(buf, empty_base, 0, &pl, 0);
+		}
+	} else {
+		for (i = 0; i < argc; i++)
+			add_pattern(argv[i], empty_base, 0, &pl, 0);
+	}
 
 	result = write_patterns_and_update(&pl);
 
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 19e8673c6b..2a0137fde3 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -101,6 +101,13 @@ test_expect_success 'clone --sparse' '
 	test_cmp expect dir
 '
 
+test_expect_success 'warn if core.sparseCheckout is disabled' '
+	test_when_finished git -C repo config --worktree core.sparseCheckout true &&
+	git -C repo config --worktree core.sparseCheckout false &&
+	git -C repo sparse-checkout set folder1 2>err &&
+	test_i18ngrep "core.sparseCheckout is disabled" err
+'
+
 test_expect_success 'set sparse-checkout using builtin' '
 	git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
 	cat >expect <<-EOF &&
@@ -120,4 +127,24 @@ test_expect_success 'set sparse-checkout using builtin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'set sparse-checkout using --stdin' '
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		/folder1/
+		/folder2/
+	EOF
+	git -C repo sparse-checkout set --stdin <expect &&
+	git -C repo sparse-checkout list >actual &&
+	test_cmp expect actual &&
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 06/17] sparse-checkout: create 'disable' subcommand
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (4 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 05/17] sparse-checkout: add '--stdin' option to set subcommand Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 07/17] trace2: add region in clear_ce_flags Jeff Hostetler via GitGitGadget
                       ` (12 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The instructions for disabling a sparse-checkout to a full
working directory are complicated and non-intuitive. Add a
subcommand, 'git sparse-checkout disable', to perform those
steps for the user.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt | 27 ++++++++++++---------------
 builtin/sparse-checkout.c             | 26 +++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 15 +++++++++++++++
 3 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index f4bd951550..b12bf385ae 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -44,6 +44,10 @@ worktrees.
 	a list of arguments following the 'set' subcommand. Update the
 	working directory to match the new patterns.
 
+'disable'::
+	Remove the sparse-checkout file, set `core.sparseCheckout` to
+	`false`, and restore the working directory to include all files.
+
 SPARSE CHECKOUT
 ----------------
 
@@ -61,6 +65,14 @@ directory, it updates the skip-worktree bits in the index based
 ont this file. The files matching the patterns in the file will
 appear in the working directory, and the rest will not.
 
+To enable the sparse-checkout feature, run `git sparse-checkout init` to
+initialize a simple sparse-checkout file and enable the `core.sparseCheckout`
+config setting. Then, run `git sparse-checkout set` to modify the patterns in
+the sparse-checkout file.
+
+To repopulate the working directory with all files, use the
+`git sparse-checkout disable` command.
+
 ## FULL PATTERN SET
 
 By default, the sparse-checkout file uses the same syntax as `.gitignore`
@@ -75,21 +87,6 @@ using negative patterns. For example, to remove the file `unwanted`:
 !unwanted
 ----------------
 
-Another tricky thing is fully repopulating the working directory when you
-no longer want sparse checkout. You cannot just disable "sparse
-checkout" because skip-worktree bits are still in the index and your working
-directory is still sparsely populated. You should re-populate the working
-directory with the `$GIT_DIR/info/sparse-checkout` file content as
-follows:
-
-----------------
-/*
-----------------
-
-Then you can disable sparse checkout. Sparse checkout support in 'git
-checkout' and similar commands is disabled by default. You need to
-set `core.sparseCheckout` to `true` in order to have sparse checkout
-support.
 
 SEE ALSO
 --------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 68f3d8433e..ab02acc125 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|list|set] <options>"),
+	N_("git sparse-checkout [init|list|set|disable] <options>"),
 	NULL
 };
 
@@ -207,6 +207,28 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 	return result;
 }
 
+static int sparse_checkout_disable(int argc, const char **argv)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	if (sc_set_config(MODE_FULL))
+		die(_("failed to change config"));
+
+	sparse_filename = get_sparse_checkout_filename();
+	fp = fopen(sparse_filename, "w");
+	fprintf(fp, "/*\n");
+	fclose(fp);
+
+	if (update_working_directory())
+		die(_("error while refreshing working directory"));
+
+	unlink(sparse_filename);
+	free(sparse_filename);
+
+	return sc_set_config(MODE_NONE);
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -231,6 +253,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_init(argc, argv);
 		if (!strcmp(argv[0], "set"))
 			return sparse_checkout_set(argc, argv, prefix);
+		if (!strcmp(argv[0], "disable"))
+			return sparse_checkout_disable(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 2a0137fde3..52d24c66ba 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -147,4 +147,19 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'sparse-checkout disable' '
+	git -C repo sparse-checkout disable &&
+	test_path_is_missing repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=false" config &&
+	ls repo >dir &&
+	cat >expect <<-EOF &&
+		a
+		deep
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 07/17] trace2: add region in clear_ce_flags
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (5 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 06/17] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Jeff Hostetler via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 08/17] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
                       ` (11 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Jeff Hostetler via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

When Git updates the working directory with the sparse-checkout
feature enabled, the unpack_trees() method calls clear_ce_flags()
to update the skip-worktree bits on the cache entries. This
check can be expensive, depending on the patterns used.

Add trace2 regions around the method, including some flag
information, so we can get granular performance data during
experiments. This data will be used to measure improvements
to the pattern-matching algorithms for sparse-checkout.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 unpack-trees.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/unpack-trees.c b/unpack-trees.c
index cd548f4fa2..26be8f3569 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1404,15 +1404,23 @@ static int clear_ce_flags(struct index_state *istate,
 			  struct pattern_list *pl)
 {
 	static struct strbuf prefix = STRBUF_INIT;
+	char label[100];
+	int rval;
 
 	strbuf_reset(&prefix);
 
-	return clear_ce_flags_1(istate,
+	xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)",
+		  (unsigned long)select_mask, (unsigned long)clear_mask);
+	trace2_region_enter("unpack_trees", label, the_repository);
+	rval = clear_ce_flags_1(istate,
 				istate->cache,
 				istate->cache_nr,
 				&prefix,
 				select_mask, clear_mask,
 				pl, 0);
+	trace2_region_leave("unpack_trees", label, the_repository);
+
+	return rval;
 }
 
 /*
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 08/17] sparse-checkout: add 'cone' mode
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (6 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 07/17] trace2: add region in clear_ce_flags Jeff Hostetler via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 09/17] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
                       ` (10 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature can have quadratic performance as
the number of patterns and number of entries in the index grow.
If there are 1,000 patterns and 1,000,000 entries, this time can
be very significant.

Create a new Boolean config option, core.sparseCheckoutCone, to
indicate that we expect the sparse-checkout file to contain a
more limited set of patterns. This is a separate config setting
from core.sparseCheckout to avoid breaking older clients by
introducing a tri-state option.

The config option does nothing right now, but will be expanded
upon in a later commit.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/config/core.txt         | 10 ++++--
 Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++
 cache.h                               |  4 ++-
 config.c                              |  5 +++
 environment.c                         |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 14 ++++++++
 6 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 75538d27e7..e2d343f738 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -591,8 +591,14 @@ core.multiPackIndex::
 	multi-pack-index design document].
 
 core.sparseCheckout::
-	Enable "sparse checkout" feature. See section "Sparse checkout" in
-	linkgit:git-read-tree[1] for more information.
+	Enable "sparse checkout" feature. See linkgit:git-sparse-checkout[1]
+	for more information.
+
+core.sparseCheckoutCone::
+	Enables the "cone mode" of the sparse checkout feature. When the
+	sparse-checkout file contains a limited set of patterns, then this
+	mode provides significant performance advantages. See
+	linkgit:git-sparse-checkout[1] for more information.
 
 core.abbrev::
 	Set the length object names are abbreviated to.  If
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index b12bf385ae..e2eb82ec45 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -88,6 +88,56 @@ using negative patterns. For example, to remove the file `unwanted`:
 ----------------
 
 
+## CONE PATTERN SET
+
+The full pattern set allows for arbitrary pattern matches and complicated
+inclusion/exclusion rules. These can result in O(N*M) pattern matches when
+updating the index, where N is the number of patterns and M is the number
+of paths in the index. To combat this performance issue, a more restricted
+pattern set is allowed when `core.sparseCheckoutCone` is enabled.
+
+The accepted patterns in the cone pattern set are:
+
+1. *Recursive:* All paths inside a directory are included.
+
+2. *Parent:* All files immediately inside a directory are included.
+
+In addition to the above two patterns, we also expect that all files in the
+root directory are included. If a recursive pattern is added, then all
+leading directories are added as parent patterns.
+
+By default, when running `git sparse-checkout init`, the root directory is
+added as a parent pattern. At this point, the sparse-checkout file contains
+the following patterns:
+
+```
+/*
+!/*/
+```
+
+This says "include everything in root, but nothing two levels below root."
+If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
+`A/B` are added as parent patterns. The resulting sparse-checkout file is
+now
+
+```
+/*
+!/*/
+/A/
+!/A/*/
+/A/B/
+!/A/B/*/
+/A/B/C/
+```
+
+Here, order matters, so the negative patterns are overridden by the positive
+patterns that appear lower in the file.
+
+If `core.sparseCheckoutCone=true`, then Git will parse the sparse-checkout file
+expecting patterns of these types. Git will warn if the patterns do not match.
+If the patterns do match the expected format, then Git will use faster hash-
+based algorithms to compute inclusion in the sparse-checkout.
+
 SEE ALSO
 --------
 
diff --git a/cache.h b/cache.h
index cf5d70c196..8e8ea67efa 100644
--- a/cache.h
+++ b/cache.h
@@ -911,12 +911,14 @@ extern char *git_replace_ref_base;
 
 extern int fsync_object_files;
 extern int core_preload_index;
-extern int core_apply_sparse_checkout;
 extern int precomposed_unicode;
 extern int protect_hfs;
 extern int protect_ntfs;
 extern const char *core_fsmonitor;
 
+int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
+
 /*
  * Include broken refs in all ref iterations, which will
  * generally choke dangerous operations rather than letting
diff --git a/config.c b/config.c
index 296a6d9cc4..f65c74f5b7 100644
--- a/config.c
+++ b/config.c
@@ -1329,6 +1329,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.sparsecheckoutcone")) {
+		core_sparse_checkout_cone = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.precomposeunicode")) {
 		precomposed_unicode = git_config_bool(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 89af47cb85..670d92bcc0 100644
--- a/environment.c
+++ b/environment.c
@@ -69,6 +69,7 @@ enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
 char *notes_ref_name;
 int grafts_replace_parents = 1;
 int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
 int merge_log_config = -1;
 int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
 unsigned long pack_size_limit_cfg;
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 52d24c66ba..36fda5907b 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -147,6 +147,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: match patterns' '
+	git -C repo config --worktree core.sparseCheckoutCone true &&
+	rm -rf repo/a repo/folder1 repo/folder2 &&
+	git -C repo read-tree -mu HEAD &&
+	git -C repo reset --hard &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 09/17] sparse-checkout: use hashmaps for cone patterns
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (7 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 08/17] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 10/17] sparse-checkout: init and set in cone mode Derrick Stolee via GitGitGadget
                       ` (9 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The parent and recursive patterns allowed by the "cone mode"
option in sparse-checkout are restrictive enough that we
can avoid using the regex parsing. Everything is based on
prefix matches, so we can use hashsets to store the prefixes
from the sparse-checkout file. When checking a path, we can
strip path entries from the path and check the hashset for
an exact match.

As a test, I created a cone-mode sparse-checkout file for the
Linux repository that actually includes every file. This was
constructed by taking every folder in the Linux repo and creating
the pattern pairs here:

	/$folder/
	!/$folder/*/

This resulted in a sparse-checkout file with 8,296 patterns.
Running 'git read-tree -mu HEAD' on this file had the following
performance:

	core.sparseCheckout=false: 0.21 s (0.00 s)
	 core.sparseCheckout=true: 3.75 s (3.50 s)
	 core.sparseCheckout=cone: 0.23 s (0.01 s)

The times in parentheses above correspond to the time spent
in the first clear_ce_flags() call, according to the trace2
performance traces.

While this example is contrived, it demonstrates how these
patterns can slow the sparse-checkout feature.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.c                              | 202 +++++++++++++++++++++++++++--
 dir.h                              |  31 +++++
 t/t1091-sparse-checkout-builtin.sh |  11 +-
 unpack-trees.c                     |   1 +
 4 files changed, 236 insertions(+), 9 deletions(-)

diff --git a/dir.c b/dir.c
index 34972abdaf..7ba4bc044e 100644
--- a/dir.c
+++ b/dir.c
@@ -599,6 +599,145 @@ void parse_path_pattern(const char **pattern,
 	*patternlen = len;
 }
 
+static int pl_hashmap_cmp(const void *unused_cmp_data,
+			  const void *a, const void *b, const void *key)
+{
+	const struct pattern_entry *ee1 = (const struct pattern_entry *)a;
+	const struct pattern_entry *ee2 = (const struct pattern_entry *)b;
+
+	size_t min_len = ee1->patternlen <= ee2->patternlen
+			 ? ee1->patternlen
+			 : ee2->patternlen;
+
+	return strncmp(ee1->pattern, ee2->pattern, min_len);
+}
+
+static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern *given)
+{
+	struct pattern_entry *translated;
+	char *truncated;
+	char *data = NULL;
+
+	if (!pl->use_cone_patterns)
+		return;
+
+	if (given->flags & PATTERN_FLAG_NEGATIVE &&
+	    given->flags & PATTERN_FLAG_MUSTBEDIR &&
+	    !strcmp(given->pattern, "/*")) {
+		pl->full_cone = 0;
+		return;
+	}
+
+	if (!given->flags && !strcmp(given->pattern, "/*")) {
+		pl->full_cone = 1;
+		return;
+	}
+
+	if (given->patternlen > 2 &&
+	    !strcmp(given->pattern + given->patternlen - 2, "/*")) {
+		if (!(given->flags & PATTERN_FLAG_NEGATIVE)) {
+			/* Not a cone pattern. */
+			pl->use_cone_patterns = 0;
+			warning(_("unrecognized pattern: '%s'"), given->pattern);
+			goto clear_hashmaps;
+		}
+
+		truncated = xstrdup(given->pattern);
+		truncated[given->patternlen - 2] = 0;
+
+		translated = xmalloc(sizeof(struct pattern_entry));
+		translated->pattern = truncated;
+		translated->patternlen = given->patternlen - 2;
+		hashmap_entry_init(translated,
+				   memhash(translated->pattern, translated->patternlen));
+
+		if (!hashmap_get(&pl->recursive_hashmap, translated, NULL)) {
+			/* We did not see the "parent" included */
+			warning(_("unrecognized negative pattern: '%s'"),
+				given->pattern);
+			free(truncated);
+			free(translated);
+			goto clear_hashmaps;
+		}
+
+		hashmap_add(&pl->parent_hashmap, translated);
+		hashmap_remove(&pl->recursive_hashmap, translated, &data);
+		free(data);
+		return;
+	}
+
+	if (given->flags & PATTERN_FLAG_NEGATIVE) {
+		warning(_("unrecognized negative pattern: '%s'"),
+			given->pattern);
+		goto clear_hashmaps;
+	}
+
+	translated = xmalloc(sizeof(struct pattern_entry));
+
+	translated->pattern = xstrdup(given->pattern);
+	translated->patternlen = given->patternlen;
+	hashmap_entry_init(translated,
+			   memhash(translated->pattern, translated->patternlen));
+
+	hashmap_add(&pl->recursive_hashmap, translated);
+
+	if (hashmap_get(&pl->parent_hashmap, translated, NULL)) {
+		/* we already included this at the parent level */
+		warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
+			given->pattern);
+		hashmap_remove(&pl->parent_hashmap, translated, &data);
+		free(data);
+		free(translated);
+	}
+
+	return;
+
+clear_hashmaps:
+	warning(_("disabling cone pattern matching"));
+	hashmap_free(&pl->parent_hashmap, 1);
+	hashmap_free(&pl->recursive_hashmap, 1);
+	pl->use_cone_patterns = 0;
+}
+
+static int hashmap_contains_path(struct hashmap *map,
+				 struct strbuf *pattern)
+{
+	struct pattern_entry p;
+
+	/* Check straight mapping */
+	p.pattern = pattern->buf;
+	p.patternlen = pattern->len;
+	hashmap_entry_init(&p, memhash(p.pattern, p.patternlen));
+	return !!hashmap_get(map, &p, NULL);
+}
+
+int hashmap_contains_parent(struct hashmap *map,
+			    const char *path,
+			    struct strbuf *buffer)
+{
+	char *slash_pos;
+
+	strbuf_setlen(buffer, 0);
+
+	if (path[0] != '/')
+		strbuf_addch(buffer, '/');
+
+	strbuf_addstr(buffer, path);
+
+	slash_pos = strrchr(buffer->buf, '/');
+
+	while (slash_pos > buffer->buf) {
+		strbuf_setlen(buffer, slash_pos - buffer->buf);
+
+		if (hashmap_contains_path(map, buffer))
+			return 1;
+
+		slash_pos = strrchr(buffer->buf, '/');
+	}
+
+	return 0;
+}
+
 void add_pattern(const char *string, const char *base,
 		 int baselen, struct pattern_list *pl, int srcpos)
 {
@@ -623,6 +762,8 @@ void add_pattern(const char *string, const char *base,
 	ALLOC_GROW(pl->patterns, pl->nr + 1, pl->alloc);
 	pl->patterns[pl->nr++] = pattern;
 	pattern->pl = pl;
+
+	add_pattern_to_hashsets(pl, pattern);
 }
 
 static int read_skip_worktree_file_from_index(const struct index_state *istate,
@@ -848,6 +989,9 @@ static int add_patterns_from_buffer(char *buf, size_t size,
 	int i, lineno = 1;
 	char *entry;
 
+	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
+	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
+
 	pl->filebuf = buf;
 
 	if (skip_utf8_bom(&buf, size))
@@ -1084,16 +1228,58 @@ enum pattern_match_result path_matches_pattern_list(
 				struct index_state *istate)
 {
 	struct path_pattern *pattern;
-	pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
-						  dtype, pl, istate);
-	if (pattern) {
-		if (pattern->flags & PATTERN_FLAG_NEGATIVE)
-			return NOT_MATCHED;
-		else
-			return MATCHED;
+	struct strbuf parent_pathname = STRBUF_INIT;
+	int result = NOT_MATCHED;
+	const char *slash_pos;
+
+	if (!pl->use_cone_patterns) {
+		pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
+							dtype, pl, istate);
+		if (pattern) {
+			if (pattern->flags & PATTERN_FLAG_NEGATIVE)
+				return NOT_MATCHED;
+			else
+				return MATCHED;
+		}
+
+		return UNDECIDED;
+	}
+
+	if (pl->full_cone)
+		return MATCHED;
+
+	strbuf_addch(&parent_pathname, '/');
+	strbuf_add(&parent_pathname, pathname, pathlen);
+
+	if (hashmap_contains_path(&pl->recursive_hashmap,
+				  &parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	slash_pos = strrchr(parent_pathname.buf, '/');
+
+	if (slash_pos == parent_pathname.buf) {
+		/* include every file in root */
+		result = MATCHED;
+		goto done;
 	}
 
-	return UNDECIDED;
+	strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+
+	if (hashmap_contains_path(&pl->parent_hashmap, &parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	if (hashmap_contains_parent(&pl->recursive_hashmap,
+				    pathname,
+				    &parent_pathname))
+		result = MATCHED;
+
+done:
+	strbuf_release(&parent_pathname);
+	return result;
 }
 
 static struct path_pattern *last_matching_pattern_from_lists(
diff --git a/dir.h b/dir.h
index 608696c958..c6c188669a 100644
--- a/dir.h
+++ b/dir.h
@@ -4,6 +4,7 @@
 /* See Documentation/technical/api-directory-listing.txt */
 
 #include "cache.h"
+#include "hashmap.h"
 #include "strbuf.h"
 
 struct dir_entry {
@@ -37,6 +38,13 @@ struct path_pattern {
 	int srcpos;
 };
 
+/* used for hashmaps for cone patterns */
+struct pattern_entry {
+	struct hashmap_entry ent;
+	char *pattern;
+	size_t patternlen;
+};
+
 /*
  * Each excludes file will be parsed into a fresh exclude_list which
  * is appended to the relevant exclude_list_group (either EXC_DIRS or
@@ -55,6 +63,26 @@ struct pattern_list {
 	const char *src;
 
 	struct path_pattern **patterns;
+
+	/*
+	 * While scanning the excludes, we attempt to match the patterns
+	 * with a more restricted set that allows us to use hashsets for
+	 * matching logic, which is faster than the linear lookup in the
+	 * excludes array above. If non-zero, that check succeeded.
+	 */
+	unsigned use_cone_patterns;
+	unsigned full_cone;
+
+	/*
+	 * Stores paths where everything starting with those paths
+	 * is included.
+	 */
+	struct hashmap recursive_hashmap;
+
+	/*
+	 * Used to check single-level parents of blobs.
+	 */
+	struct hashmap parent_hashmap;
 };
 
 /*
@@ -269,6 +297,9 @@ int is_excluded(struct dir_struct *dir,
 		struct index_state *istate,
 		const char *name, int *dtype);
 
+int hashmap_contains_parent(struct hashmap *map,
+			    const char *path,
+			    struct strbuf *buffer);
 struct pattern_list *add_pattern_list(struct dir_struct *dir,
 				      int group_type, const char *src);
 int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 36fda5907b..b0d5aeb33a 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -150,7 +150,8 @@ test_expect_success 'set sparse-checkout using --stdin' '
 test_expect_success 'cone mode: match patterns' '
 	git -C repo config --worktree core.sparseCheckoutCone true &&
 	rm -rf repo/a repo/folder1 repo/folder2 &&
-	git -C repo read-tree -mu HEAD &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep ! "disabling cone patterns" err &&
 	git -C repo reset --hard &&
 	ls repo >dir  &&
 	cat >expect <<-EOF &&
@@ -161,6 +162,14 @@ test_expect_success 'cone mode: match patterns' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: warn on bad pattern' '
+	test_when_finished mv sparse-checkout repo/.git/info/ &&
+	cp repo/.git/info/sparse-checkout . &&
+	echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep "unrecognized negative pattern" err
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
diff --git a/unpack-trees.c b/unpack-trees.c
index 26be8f3569..566df11309 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1479,6 +1479,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 		o->skip_sparse_checkout = 1;
 	if (!o->skip_sparse_checkout) {
 		char *sparse = git_pathdup("info/sparse-checkout");
+		pl.use_cone_patterns = core_sparse_checkout_cone;
 		if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
 			o->skip_sparse_checkout = 1;
 		else
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 10/17] sparse-checkout: init and set in cone mode
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (8 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 09/17] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 11/17] unpack-trees: hash less " Derrick Stolee via GitGitGadget
                       ` (8 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

To make the cone pattern set easy to use, update the behavior of
'git sparse-checkout [init|set]'.

Add '--cone' flag to 'git sparse-checkout init' to set the config
option 'core.sparseCheckoutCone=true'.

When running 'git sparse-checkout set' in cone mode, a user only
needs to supply a list of recursive folder matches. Git will
automatically add the necessary parent matches for the leading
directories.

When testing 'git sparse-checkout set' in cone mode, check the
error stream to ensure we do not see any errors. Specifically,
we want to avoid the warning that the patterns do not match
the cone-mode patterns.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 161 +++++++++++++++++++++++++++--
 dir.c                              |   4 +-
 dir.h                              |   2 +
 t/t1091-sparse-checkout-builtin.sh |  49 +++++++++
 4 files changed, 204 insertions(+), 12 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index ab02acc125..b220f15741 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -6,6 +6,7 @@
 #include "repository.h"
 #include "run-command.h"
 #include "strbuf.h"
+#include "string-list.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
 	N_("git sparse-checkout [init|list|set|disable] <options>"),
@@ -77,11 +78,13 @@ static int update_working_directory(void)
 enum sparse_checkout_mode {
 	MODE_NONE = 0,
 	MODE_FULL = 1,
+	MODE_CONE = 2,
 };
 
 static int sc_set_config(enum sparse_checkout_mode mode)
 {
 	struct argv_array argv = ARGV_ARRAY_INIT;
+	struct argv_array cone_argv = ARGV_ARRAY_INIT;
 
 	if (git_config_set_gently("extensions.worktreeConfig", "true")) {
 		error(_("failed to set extensions.worktreeConfig setting"));
@@ -100,9 +103,31 @@ static int sc_set_config(enum sparse_checkout_mode mode)
 		return 1;
 	}
 
+	argv_array_pushl(&cone_argv, "config", "--worktree",
+			 "core.sparseCheckoutCone", NULL);
+
+	if (mode == MODE_CONE)
+		argv_array_push(&cone_argv, "true");
+	else
+		argv_array_push(&cone_argv, "false");
+
+	if (run_command_v_opt(cone_argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to enable core.sparseCheckoutCone"));
+		return 1;
+	}
+
 	return 0;
 }
 
+static char const * const builtin_sparse_checkout_init_usage[] = {
+	N_("git sparse-checkout init [--cone]"),
+	NULL
+};
+
+static struct sparse_checkout_init_opts {
+	int cone_mode;
+} init_opts;
+
 static int sparse_checkout_init(int argc, const char **argv)
 {
 	struct pattern_list pl;
@@ -110,8 +135,21 @@ static int sparse_checkout_init(int argc, const char **argv)
 	FILE *fp;
 	int res;
 	struct object_id oid;
+	int mode;
 
-	if (sc_set_config(MODE_FULL))
+	static struct option builtin_sparse_checkout_init_options[] = {
+		OPT_BOOL(0, "cone", &init_opts.cone_mode,
+			 N_("initialize the sparse-checkout in cone mode")),
+		OPT_END(),
+	};
+
+	argc = parse_options(argc, argv, NULL,
+			     builtin_sparse_checkout_init_options,
+			     builtin_sparse_checkout_init_usage, 0);
+
+	mode = init_opts.cone_mode ? MODE_CONE : MODE_FULL;
+
+	if (sc_set_config(mode))
 		return 1;
 
 	memset(&pl, 0, sizeof(pl));
@@ -140,6 +178,72 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return update_working_directory();
 }
 
+static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
+{
+	struct pattern_entry *e = xmalloc(sizeof(struct pattern_entry));
+	e->patternlen = path->len;
+	e->pattern = strbuf_detach(path, NULL);
+	hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+
+	hashmap_add(&pl->recursive_hashmap, e);
+
+	while (e->patternlen) {
+		char *slash = strrchr(e->pattern, '/');
+		char *oldpattern = e->pattern;
+		size_t newlen;
+
+		if (slash == e->pattern)
+			break;
+
+		newlen = slash - e->pattern;
+		e = xmalloc(sizeof(struct pattern_entry));
+		e->patternlen = newlen;
+		e->pattern = xstrndup(oldpattern, newlen);
+		hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
+
+		if (!hashmap_get(&pl->parent_hashmap, e, NULL))
+			hashmap_add(&pl->parent_hashmap, e);
+	}
+}
+
+static void write_cone_to_file(FILE *fp, struct pattern_list *pl)
+{
+	int i;
+	struct pattern_entry *entry;
+	struct hashmap_iter iter;
+	struct string_list sl = STRING_LIST_INIT_DUP;
+
+	hashmap_iter_init(&pl->parent_hashmap, &iter);
+	while ((entry = hashmap_iter_next(&iter)))
+		string_list_insert(&sl, entry->pattern);
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	fprintf(fp, "/*\n!/*/\n");
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+
+		if (strlen(pattern))
+			fprintf(fp, "%s/\n!%s/*/\n", pattern, pattern);
+	}
+
+	string_list_clear(&sl, 0);
+
+	hashmap_iter_init(&pl->recursive_hashmap, &iter);
+	while ((entry = hashmap_iter_next(&iter)))
+		string_list_insert(&sl, entry->pattern);
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+		fprintf(fp, "%s/\n", pattern);
+	}
+}
+
 static int write_patterns_and_update(struct pattern_list *pl)
 {
 	char *sparse_filename;
@@ -152,13 +256,33 @@ static int write_patterns_and_update(struct pattern_list *pl)
 
 	sparse_filename = get_sparse_checkout_filename();
 	fp = fopen(sparse_filename, "w");
-	write_patterns_to_file(fp, pl);
+
+	if (core_sparse_checkout_cone)
+		write_cone_to_file(fp, pl);
+	else
+		write_patterns_to_file(fp, pl);
+
 	fclose(fp);
 	free(sparse_filename);
 
 	return update_working_directory();
 }
 
+static void strbuf_to_cone_pattern(struct strbuf *line, struct pattern_list *pl)
+{
+	strbuf_trim(line);
+
+	strbuf_trim_trailing_dir_sep(line);
+
+	if (!line->len)
+		return;
+
+	if (line->buf[0] != '/')
+		strbuf_insert(line, 0, "/", 1);
+
+	insert_recursive_pattern(pl, line);
+}
+
 static char const * const builtin_sparse_checkout_set_usage[] = {
 	N_("git sparse-checkout set [--stdin|<patterns>]"),
 	NULL
@@ -188,17 +312,34 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 			     builtin_sparse_checkout_set_usage,
 			     PARSE_OPT_KEEP_UNKNOWN);
 
-	if (set_opts.use_stdin) {
+	if (core_sparse_checkout_cone) {
 		struct strbuf line = STRBUF_INIT;
-
-		while (!strbuf_getline(&line, stdin)) {
-			size_t len;
-			char *buf = strbuf_detach(&line, &len);
-			add_pattern(buf, empty_base, 0, &pl, 0);
+		hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0);
+		hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0);
+
+		if (set_opts.use_stdin) {
+			while (!strbuf_getline(&line, stdin))
+				strbuf_to_cone_pattern(&line, &pl);
+		} else {
+			for (i = 0; i < argc; i++) {
+				strbuf_setlen(&line, 0);
+				strbuf_addstr(&line, argv[i]);
+				strbuf_to_cone_pattern(&line, &pl);
+			}
 		}
 	} else {
-		for (i = 0; i < argc; i++)
-			add_pattern(argv[i], empty_base, 0, &pl, 0);
+		if (set_opts.use_stdin) {
+			struct strbuf line = STRBUF_INIT;
+
+			while (!strbuf_getline(&line, stdin)) {
+				size_t len;
+				char *buf = strbuf_detach(&line, &len);
+				add_pattern(buf, empty_base, 0, &pl, 0);
+			}
+		} else {
+			for (i = 0; i < argc; i++)
+				add_pattern(argv[i], empty_base, 0, &pl, 0);
+		}
 	}
 
 	result = write_patterns_and_update(&pl);
diff --git a/dir.c b/dir.c
index 7ba4bc044e..2b240111aa 100644
--- a/dir.c
+++ b/dir.c
@@ -599,8 +599,8 @@ void parse_path_pattern(const char **pattern,
 	*patternlen = len;
 }
 
-static int pl_hashmap_cmp(const void *unused_cmp_data,
-			  const void *a, const void *b, const void *key)
+int pl_hashmap_cmp(const void *unused_cmp_data,
+		   const void *a, const void *b, const void *key)
 {
 	const struct pattern_entry *ee1 = (const struct pattern_entry *)a;
 	const struct pattern_entry *ee2 = (const struct pattern_entry *)b;
diff --git a/dir.h b/dir.h
index c6c188669a..f7a2f000c3 100644
--- a/dir.h
+++ b/dir.h
@@ -297,6 +297,8 @@ int is_excluded(struct dir_struct *dir,
 		struct index_state *istate,
 		const char *name, int *dtype);
 
+int pl_hashmap_cmp(const void *unused_cmp_data,
+		   const void *a, const void *b, const void *key);
 int hashmap_contains_parent(struct hashmap *map,
 			    const char *path,
 			    struct strbuf *buffer);
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index b0d5aeb33a..db6371b079 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -185,4 +185,53 @@ test_expect_success 'sparse-checkout disable' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: init and set' '
+	git -C repo sparse-checkout init --cone &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckoutcone=true" config &&
+	ls repo >dir  &&
+	echo a >expect &&
+	test_cmp expect dir &&
+	git -C repo sparse-checkout set deep/deeper1/deepest/ 2>err &&
+	test_line_count = 0 err &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deep
+	EOF
+	ls repo/deep >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deeper1
+	EOF
+	ls repo/deep/deeper1 >dir  &&
+	cat >expect <<-EOF &&
+		a
+		deepest
+	EOF
+	test_cmp expect dir &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		/deep/
+		!/deep/*/
+		/deep/deeper1/
+		!/deep/deeper1/*/
+		/deep/deeper1/deepest/
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	git -C repo sparse-checkout set --stdin 2>err <<-EOF &&
+		folder1
+		folder2
+	EOF
+	test_line_count = 0 err &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	ls repo >dir &&
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 11/17] unpack-trees: hash less in cone mode
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (9 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 10/17] sparse-checkout: init and set in cone mode Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` " Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 13/17] read-tree: show progress by default Derrick Stolee via GitGitGadget
                       ` (7 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature in "cone mode" can use the fact that
the recursive patterns are "connected" to the root via parent
patterns to decide if a directory is entirely contained in the
sparse-checkout or entirely removed.

In these cases, we can skip hashing the paths within those
directories and simply set the skipworktree bit to the correct
value.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.c          |  4 ++--
 dir.h          |  1 +
 unpack-trees.c | 38 +++++++++++++++++++++++---------------
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/dir.c b/dir.c
index 2b240111aa..eb6b2913ef 100644
--- a/dir.c
+++ b/dir.c
@@ -1253,7 +1253,7 @@ enum pattern_match_result path_matches_pattern_list(
 
 	if (hashmap_contains_path(&pl->recursive_hashmap,
 				  &parent_pathname)) {
-		result = MATCHED;
+		result = MATCHED_RECURSIVE;
 		goto done;
 	}
 
@@ -1275,7 +1275,7 @@ enum pattern_match_result path_matches_pattern_list(
 	if (hashmap_contains_parent(&pl->recursive_hashmap,
 				    pathname,
 				    &parent_pathname))
-		result = MATCHED;
+		result = MATCHED_RECURSIVE;
 
 done:
 	strbuf_release(&parent_pathname);
diff --git a/dir.h b/dir.h
index f7a2f000c3..c868129a54 100644
--- a/dir.h
+++ b/dir.h
@@ -262,6 +262,7 @@ enum pattern_match_result {
 	UNDECIDED = -1,
 	NOT_MATCHED = 0,
 	MATCHED = 1,
+	MATCHED_RECURSIVE = 2,
 };
 
 /*
diff --git a/unpack-trees.c b/unpack-trees.c
index 566df11309..b5cf591c38 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1280,15 +1280,17 @@ static int clear_ce_flags_dir(struct index_state *istate,
 	struct cache_entry **cache_end;
 	int dtype = DT_DIR;
 	int rc;
-	enum pattern_match_result ret;
-	ret = path_matches_pattern_list(prefix->buf, prefix->len,
-					basename, &dtype, pl, istate);
+	enum pattern_match_result ret, orig_ret;
+	orig_ret = path_matches_pattern_list(prefix->buf, prefix->len,
+					     basename, &dtype, pl, istate);
 
 	strbuf_addch(prefix, '/');
 
 	/* If undecided, use matching result of parent dir in defval */
-	if (ret == UNDECIDED)
+	if (orig_ret == UNDECIDED)
 		ret = default_match;
+	else
+		ret = orig_ret;
 
 	for (cache_end = cache; cache_end != cache + nr; cache_end++) {
 		struct cache_entry *ce = *cache_end;
@@ -1296,17 +1298,23 @@ static int clear_ce_flags_dir(struct index_state *istate,
 			break;
 	}
 
-	/*
-	 * TODO: check pl, if there are no patterns that may conflict
-	 * with ret (iow, we know in advance the incl/excl
-	 * decision for the entire directory), clear flag here without
-	 * calling clear_ce_flags_1(). That function will call
-	 * the expensive path_matches_pattern_list() on every entry.
-	 */
-	rc = clear_ce_flags_1(istate, cache, cache_end - cache,
-			      prefix,
-			      select_mask, clear_mask,
-			      pl, ret);
+	if (pl->use_cone_patterns && orig_ret == MATCHED_RECURSIVE) {
+		struct cache_entry **ce = cache;
+		rc = (cache_end - cache) / sizeof(struct cache_entry *);
+
+		while (ce < cache_end) {
+			(*ce)->ce_flags &= ~clear_mask;
+			ce++;
+		}
+	} else if (pl->use_cone_patterns && orig_ret == NOT_MATCHED) {
+		rc = (cache_end - cache) / sizeof(struct cache_entry *);
+	} else {
+		rc = clear_ce_flags_1(istate, cache, cache_end - cache,
+				      prefix,
+				      select_mask, clear_mask,
+				      pl, ret);
+	}
+
 	strbuf_setlen(prefix, prefix->len - 1);
 	return rc;
 }
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 12/17] unpack-trees: add progress to clear_ce_flags()
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (11 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 13/17] read-tree: show progress by default Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 14/17] sparse-checkout: sanitize for nested folders Derrick Stolee via GitGitGadget
                       ` (5 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

When a large repository has many sparse-checkout patterns, the
process for updating the skip-worktree bits can take long enough
that a user gets confused about why nothing is happening. Update the
clear_ce_flags() method to write progress.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 cache.h        |  2 ++
 unpack-trees.c | 56 ++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/cache.h b/cache.h
index 8e8ea67efa..9d9d88a0f0 100644
--- a/cache.h
+++ b/cache.h
@@ -328,6 +328,7 @@ static inline unsigned int canon_mode(unsigned int mode)
 
 struct split_index;
 struct untracked_cache;
+struct progress;
 
 struct index_state {
 	struct cache_entry **cache;
@@ -350,6 +351,7 @@ struct index_state {
 	uint64_t fsmonitor_last_update;
 	struct ewah_bitmap *fsmonitor_dirty;
 	struct mem_pool *ce_mem_pool;
+	struct progress *progress;
 };
 
 /* Name hashing */
diff --git a/unpack-trees.c b/unpack-trees.c
index b5cf591c38..edf0fb4673 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1266,7 +1266,8 @@ static int clear_ce_flags_1(struct index_state *istate,
 			    struct strbuf *prefix,
 			    int select_mask, int clear_mask,
 			    struct pattern_list *pl,
-			    enum pattern_match_result default_match);
+			    enum pattern_match_result default_match,
+			    int progress_nr);
 
 /* Whole directory matching */
 static int clear_ce_flags_dir(struct index_state *istate,
@@ -1275,7 +1276,8 @@ static int clear_ce_flags_dir(struct index_state *istate,
 			      char *basename,
 			      int select_mask, int clear_mask,
 			      struct pattern_list *pl,
-			      enum pattern_match_result default_match)
+			      enum pattern_match_result default_match,
+			      int progress_nr)
 {
 	struct cache_entry **cache_end;
 	int dtype = DT_DIR;
@@ -1312,7 +1314,8 @@ static int clear_ce_flags_dir(struct index_state *istate,
 		rc = clear_ce_flags_1(istate, cache, cache_end - cache,
 				      prefix,
 				      select_mask, clear_mask,
-				      pl, ret);
+				      pl, ret,
+				      progress_nr);
 	}
 
 	strbuf_setlen(prefix, prefix->len - 1);
@@ -1339,7 +1342,8 @@ static int clear_ce_flags_1(struct index_state *istate,
 			    struct strbuf *prefix,
 			    int select_mask, int clear_mask,
 			    struct pattern_list *pl,
-			    enum pattern_match_result default_match)
+			    enum pattern_match_result default_match,
+			    int progress_nr)
 {
 	struct cache_entry **cache_end = cache + nr;
 
@@ -1353,8 +1357,11 @@ static int clear_ce_flags_1(struct index_state *istate,
 		int len, dtype;
 		enum pattern_match_result ret;
 
+		display_progress(istate->progress, progress_nr);
+
 		if (select_mask && !(ce->ce_flags & select_mask)) {
 			cache++;
+			progress_nr++;
 			continue;
 		}
 
@@ -1375,20 +1382,26 @@ static int clear_ce_flags_1(struct index_state *istate,
 						       prefix,
 						       prefix->buf + prefix->len - len,
 						       select_mask, clear_mask,
-						       pl, default_match);
+						       pl, default_match,
+						       progress_nr);
 
 			/* clear_c_f_dir eats a whole dir already? */
 			if (processed) {
 				cache += processed;
+				progress_nr += processed;
 				strbuf_setlen(prefix, prefix->len - len);
 				continue;
 			}
 
 			strbuf_addch(prefix, '/');
-			cache += clear_ce_flags_1(istate, cache, cache_end - cache,
-						  prefix,
-						  select_mask, clear_mask, pl,
-						  default_match);
+			processed = clear_ce_flags_1(istate, cache, cache_end - cache,
+						     prefix,
+						     select_mask, clear_mask, pl,
+						     default_match, progress_nr);
+
+			cache += processed;
+			progress_nr += processed;
+
 			strbuf_setlen(prefix, prefix->len - len - 1);
 			continue;
 		}
@@ -1403,19 +1416,27 @@ static int clear_ce_flags_1(struct index_state *istate,
 		if (ret == MATCHED)
 			ce->ce_flags &= ~clear_mask;
 		cache++;
+		progress_nr++;
 	}
+
+	display_progress(istate->progress, progress_nr);
 	return nr - (cache_end - cache);
 }
 
 static int clear_ce_flags(struct index_state *istate,
 			  int select_mask, int clear_mask,
-			  struct pattern_list *pl)
+			  struct pattern_list *pl,
+			  int show_progress)
 {
 	static struct strbuf prefix = STRBUF_INIT;
 	char label[100];
 	int rval;
 
 	strbuf_reset(&prefix);
+	if (show_progress)
+		istate->progress = start_delayed_progress(
+					_("Updating index flags"),
+					istate->cache_nr);
 
 	xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)",
 		  (unsigned long)select_mask, (unsigned long)clear_mask);
@@ -1425,9 +1446,10 @@ static int clear_ce_flags(struct index_state *istate,
 				istate->cache_nr,
 				&prefix,
 				select_mask, clear_mask,
-				pl, 0);
+				pl, 0, 0);
 	trace2_region_leave("unpack_trees", label, the_repository);
 
+	stop_progress(&istate->progress);
 	return rval;
 }
 
@@ -1436,7 +1458,8 @@ static int clear_ce_flags(struct index_state *istate,
  */
 static void mark_new_skip_worktree(struct pattern_list *pl,
 				   struct index_state *istate,
-				   int select_flag, int skip_wt_flag)
+				   int select_flag, int skip_wt_flag,
+				   int show_progress)
 {
 	int i;
 
@@ -1460,7 +1483,7 @@ static void mark_new_skip_worktree(struct pattern_list *pl,
 	 * 2. Widen worktree according to sparse-checkout file.
 	 * Matched entries will have skip_wt_flag cleared (i.e. "in")
 	 */
-	clear_ce_flags(istate, select_flag, skip_wt_flag, pl);
+	clear_ce_flags(istate, select_flag, skip_wt_flag, pl, show_progress);
 }
 
 static int verify_absent(const struct cache_entry *,
@@ -1522,7 +1545,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 	 * Sparse checkout loop #1: set NEW_SKIP_WORKTREE on existing entries
 	 */
 	if (!o->skip_sparse_checkout)
-		mark_new_skip_worktree(o->pl, o->src_index, 0, CE_NEW_SKIP_WORKTREE);
+		mark_new_skip_worktree(o->pl, o->src_index, 0,
+				       CE_NEW_SKIP_WORKTREE, o->verbose_update);
 
 	if (!dfc)
 		dfc = xcalloc(1, cache_entry_size(0));
@@ -1587,7 +1611,9 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 		 * If the will have NEW_SKIP_WORKTREE, also set CE_SKIP_WORKTREE
 		 * so apply_sparse_checkout() won't attempt to remove it from worktree
 		 */
-		mark_new_skip_worktree(o->pl, &o->result, CE_ADDED, CE_SKIP_WORKTREE | CE_NEW_SKIP_WORKTREE);
+		mark_new_skip_worktree(o->pl, &o->result,
+				       CE_ADDED, CE_SKIP_WORKTREE | CE_NEW_SKIP_WORKTREE,
+				       o->verbose_update);
 
 		ret = 0;
 		for (i = 0; i < o->result.cache_nr; i++) {
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 13/17] read-tree: show progress by default
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (10 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 11/17] unpack-trees: hash less " Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-12 22:16       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 12/17] unpack-trees: add progress to clear_ce_flags() Derrick Stolee via GitGitGadget
                       ` (6 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The read-tree builtin has a --verbose option that signals to show
progress and other data while updating the index. Update this to
be on by default when stderr is a terminal window.

This will help tools like 'git sparse-checkout' to automatically
benefit from progress indicators when a user runs these commands.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/read-tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/builtin/read-tree.c b/builtin/read-tree.c
index ca5e655d2f..69963d83dc 100644
--- a/builtin/read-tree.c
+++ b/builtin/read-tree.c
@@ -162,6 +162,7 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
 	opts.head_idx = -1;
 	opts.src_index = &the_index;
 	opts.dst_index = &the_index;
+	opts.verbose_update = isatty(2);
 
 	git_config(git_read_tree_config, NULL);
 
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 14/17] sparse-checkout: sanitize for nested folders
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (12 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 12/17] unpack-trees: add progress to clear_ce_flags() Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-07 20:08     ` [PATCH v3 15/17] sparse-checkout: update working directory in-process Derrick Stolee via GitGitGadget
                       ` (4 subsequent siblings)
  18 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

If a user provides folders A/ and A/B/ for inclusion in a cone-mode
sparse-checkout file, the parsing logic will notice that A/ appears
both as a "parent" type pattern and as a "recursive" type pattern.
This is unexpected, so the parser emits a warning and reverts to
the old logic for checking sparse-checkout patterns.

Prevent this from happening accidentally by sanitizing the folders
for this type of inclusion in the 'git sparse-checkout' builtin.
This happens in two ways:

1. Do not include any parent patterns that also appear as recursive
   patterns.

2. Do not include any recursive patterns deeper than other recursive
   patterns.

In order to minimize duplicate code for scanning parents, create
hashmap_contains_parent() method. It takes a strbuf buffer to
avoid reallocating a buffer when calling in a tight loop.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 22 ++++++++++++++++++----
 t/t1091-sparse-checkout-builtin.sh | 11 +++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index b220f15741..25786f8bb0 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -212,10 +212,18 @@ static void write_cone_to_file(FILE *fp, struct pattern_list *pl)
 	struct pattern_entry *entry;
 	struct hashmap_iter iter;
 	struct string_list sl = STRING_LIST_INIT_DUP;
+	struct strbuf parent_pattern = STRBUF_INIT;
 
 	hashmap_iter_init(&pl->parent_hashmap, &iter);
-	while ((entry = hashmap_iter_next(&iter)))
-		string_list_insert(&sl, entry->pattern);
+	while ((entry = hashmap_iter_next(&iter))) {
+		if (hashmap_get(&pl->recursive_hashmap, entry, NULL))
+			continue;
+
+		if (!hashmap_contains_parent(&pl->recursive_hashmap,
+					     entry->pattern,
+					     &parent_pattern))
+			string_list_insert(&sl, entry->pattern);
+	}
 
 	string_list_sort(&sl);
 	string_list_remove_duplicates(&sl, 0);
@@ -232,8 +240,14 @@ static void write_cone_to_file(FILE *fp, struct pattern_list *pl)
 	string_list_clear(&sl, 0);
 
 	hashmap_iter_init(&pl->recursive_hashmap, &iter);
-	while ((entry = hashmap_iter_next(&iter)))
-		string_list_insert(&sl, entry->pattern);
+	while ((entry = hashmap_iter_next(&iter))) {
+		if (!hashmap_contains_parent(&pl->recursive_hashmap,
+					     entry->pattern,
+					     &parent_pattern))
+			string_list_insert(&sl, entry->pattern);
+	}
+
+	strbuf_release(&parent_pattern);
 
 	string_list_sort(&sl);
 	string_list_remove_duplicates(&sl, 0);
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index db6371b079..ee4d361787 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -234,4 +234,15 @@ test_expect_success 'cone mode: init and set' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: set with nested folders' '
+	git -C repo sparse-checkout set deep deep/deeper1/deepest 2>err &&
+	test_line_count = 0 err &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		/deep/
+	EOF
+	test_cmp repo/.git/info/sparse-checkout expect
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 15/17] sparse-checkout: update working directory in-process
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (13 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 14/17] sparse-checkout: sanitize for nested folders Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-12 22:57       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 16/17] sparse-checkout: write using lockfile Derrick Stolee via GitGitGadget
                       ` (3 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout builtin used 'git read-tree -mu HEAD' to update the
skip-worktree bits in the index and to update the working directory.
This extra process is overly complex, and prone to failure. It also
requires that we write our changes to the sparse-checkout file before
trying to update the index.

Remove this extra process call by creating a direct call to
unpack_trees() in the same way 'git read-tree -mu HEAD' does. In
addition, provide an in-memory list of patterns so we can avoid
reading from the sparse-checkout file. This allows us to test a
proposed change to the file before writing to it.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/read-tree.c                |  2 +-
 builtin/sparse-checkout.c          | 85 +++++++++++++++++++++++++-----
 t/t1091-sparse-checkout-builtin.sh | 17 ++++++
 unpack-trees.c                     |  5 +-
 unpack-trees.h                     |  3 +-
 5 files changed, 95 insertions(+), 17 deletions(-)

diff --git a/builtin/read-tree.c b/builtin/read-tree.c
index 69963d83dc..d7eeaa26ec 100644
--- a/builtin/read-tree.c
+++ b/builtin/read-tree.c
@@ -186,7 +186,7 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
 
 	if (opts.reset || opts.merge || opts.prefix) {
 		if (read_cache_unmerged() && (opts.prefix || opts.merge))
-			die("You need to resolve your current index first");
+			die(_("You need to resolve your current index first"));
 		stage = opts.merge = 1;
 	}
 	resolve_undo_clear();
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 25786f8bb0..542d57fac6 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -7,6 +7,11 @@
 #include "run-command.h"
 #include "strbuf.h"
 #include "string-list.h"
+#include "cache.h"
+#include "cache-tree.h"
+#include "lockfile.h"
+#include "resolve-undo.h"
+#include "unpack-trees.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
 	N_("git sparse-checkout [init|list|set|disable] <options>"),
@@ -60,18 +65,53 @@ static int sparse_checkout_list(int argc, const char **argv)
 	return 0;
 }
 
-static int update_working_directory(void)
+static int update_working_directory(struct pattern_list *pl)
 {
-	struct argv_array argv = ARGV_ARRAY_INIT;
 	int result = 0;
-	argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
+	struct unpack_trees_options o;
+	struct lock_file lock_file = LOCK_INIT;
+	struct object_id oid;
+	struct tree *tree;
+	struct tree_desc t;
+	struct repository *r = the_repository;
 
-	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
-		error(_("failed to update index with new sparse-checkout paths"));
-		result = 1;
+	if (repo_read_index_unmerged(r))
+		die(_("You need to resolve your current index first"));
+
+	if (get_oid("HEAD", &oid))
+		return 0;
+
+	tree = parse_tree_indirect(&oid);
+	parse_tree(tree);
+	init_tree_desc(&t, tree->buffer, tree->size);
+
+	memset(&o, 0, sizeof(o));
+	o.verbose_update = isatty(2);
+	o.merge = 1;
+	o.update = 1;
+	o.fn = oneway_merge;
+	o.head_idx = -1;
+	o.src_index = r->index;
+	o.dst_index = r->index;
+	o.skip_sparse_checkout = 0;
+	o.pl = pl;
+	o.keep_pattern_list = !!pl;
+
+	resolve_undo_clear_index(r->index);
+	setup_work_tree();
+
+	cache_tree_free(&r->index->cache_tree);
+
+	repo_hold_locked_index(r, &lock_file, LOCK_DIE_ON_ERROR);
+
+	core_apply_sparse_checkout = 1;
+	result = unpack_trees(1, &t, &o);
+
+	if (!result) {
+		prime_cache_tree(r, r->index, tree);
+		write_locked_index(r->index, &lock_file, COMMIT_LOCK);
 	}
 
-	argv_array_clear(&argv);
 	return result;
 }
 
@@ -147,7 +187,11 @@ static int sparse_checkout_init(int argc, const char **argv)
 			     builtin_sparse_checkout_init_options,
 			     builtin_sparse_checkout_init_usage, 0);
 
-	mode = init_opts.cone_mode ? MODE_CONE : MODE_FULL;
+	if (init_opts.cone_mode) {
+		mode = MODE_CONE;
+		core_sparse_checkout_cone = 1;
+	} else
+		mode = MODE_FULL;
 
 	if (sc_set_config(mode))
 		return 1;
@@ -175,12 +219,14 @@ static int sparse_checkout_init(int argc, const char **argv)
 	}
 
 reset_dir:
-	return update_working_directory();
+	core_apply_sparse_checkout = 1;
+	return update_working_directory(NULL);
 }
 
 static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
 {
-	struct pattern_entry *e = xmalloc(sizeof(struct pattern_entry));
+	struct pattern_entry *e = xmalloc(sizeof(*e));
+
 	e->patternlen = path->len;
 	e->pattern = strbuf_detach(path, NULL);
 	hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
@@ -262,12 +308,21 @@ static int write_patterns_and_update(struct pattern_list *pl)
 {
 	char *sparse_filename;
 	FILE *fp;
-
+	int result;
+	
 	if (!core_apply_sparse_checkout) {
 		warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
 		warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
 	}
 
+	result = update_working_directory(pl);
+
+	if (result) {
+		clear_pattern_list(pl);
+		update_working_directory(NULL);
+		return result;
+	}
+
 	sparse_filename = get_sparse_checkout_filename();
 	fp = fopen(sparse_filename, "w");
 
@@ -277,9 +332,11 @@ static int write_patterns_and_update(struct pattern_list *pl)
 		write_patterns_to_file(fp, pl);
 
 	fclose(fp);
+
 	free(sparse_filename);
+	clear_pattern_list(pl);
 
-	return update_working_directory();
+	return 0;
 }
 
 static void strbuf_to_cone_pattern(struct strbuf *line, struct pattern_list *pl)
@@ -330,6 +387,7 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 		struct strbuf line = STRBUF_INIT;
 		hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0);
 		hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0);
+		pl.use_cone_patterns = 1;
 
 		if (set_opts.use_stdin) {
 			while (!strbuf_getline(&line, stdin))
@@ -375,7 +433,8 @@ static int sparse_checkout_disable(int argc, const char **argv)
 	fprintf(fp, "/*\n");
 	fclose(fp);
 
-	if (update_working_directory())
+	core_apply_sparse_checkout = 1;
+	if (update_working_directory(NULL))
 		die(_("error while refreshing working directory"));
 
 	unlink(sparse_filename);
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index ee4d361787..82eb5fb2f8 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -199,11 +199,13 @@ test_expect_success 'cone mode: init and set' '
 		a
 		deep
 	EOF
+	test_cmp dir expect &&
 	ls repo/deep >dir  &&
 	cat >expect <<-EOF &&
 		a
 		deeper1
 	EOF
+	test_cmp dir expect &&
 	ls repo/deep/deeper1 >dir  &&
 	cat >expect <<-EOF &&
 		a
@@ -245,4 +247,19 @@ test_expect_success 'cone mode: set with nested folders' '
 	test_cmp repo/.git/info/sparse-checkout expect
 '
 
+test_expect_success 'revert to old sparse-checkout on bad update' '
+	echo update >repo/deep/deeper2/a &&
+	cp repo/.git/info/sparse-checkout expect &&
+	test_must_fail git -C repo sparse-checkout set deep/deeper1 2>err &&
+	test_i18ngrep "Cannot update sparse checkout" err &&
+	test_cmp repo/.git/info/sparse-checkout expect &&
+	ls repo/deep >dir &&
+	cat >expect <<-EOF &&
+		a
+		deeper1
+		deeper2
+	EOF
+	test_cmp dir expect
+'
+
 test_done
diff --git a/unpack-trees.c b/unpack-trees.c
index edf0fb4673..f0fee5adf2 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1508,7 +1508,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 	memset(&pl, 0, sizeof(pl));
 	if (!core_apply_sparse_checkout || !o->update)
 		o->skip_sparse_checkout = 1;
-	if (!o->skip_sparse_checkout) {
+	if (!o->skip_sparse_checkout && !o->pl) {
 		char *sparse = git_pathdup("info/sparse-checkout");
 		pl.use_cone_patterns = core_sparse_checkout_cone;
 		if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
@@ -1681,7 +1681,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 
 done:
 	trace_performance_leave("unpack_trees");
-	clear_pattern_list(&pl);
+	if (!o->keep_pattern_list)
+		clear_pattern_list(&pl);
 	return ret;
 
 return_failed:
diff --git a/unpack-trees.h b/unpack-trees.h
index f2eee0c7c5..ca94a421a5 100644
--- a/unpack-trees.h
+++ b/unpack-trees.h
@@ -59,7 +59,8 @@ struct unpack_trees_options {
 		     quiet,
 		     exiting_early,
 		     show_all_errors,
-		     dry_run;
+		     dry_run,
+		     keep_pattern_list;
 	const char *prefix;
 	int cache_bottom;
 	struct dir_struct *dir;
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 16/17] sparse-checkout: write using lockfile
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (14 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 15/17] sparse-checkout: update working directory in-process Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-12 22:59       ` Elijah Newren
  2019-10-07 20:08     ` [PATCH v3 17/17] sparse-checkout: cone mode should not interact with .gitignore Derrick Stolee via GitGitGadget
                       ` (2 subsequent siblings)
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

If two 'git sparse-checkout set' subcommands are launched at the
same time, the behavior can be unexpected as they compete to write
the sparse-checkout file and update the working directory.

Take a lockfile around the writes to the sparse-checkout file. In
addition, acquire this lock around the working directory update
to avoid two commands updating the working directory in different
ways.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 15 ++++++++++++---
 t/t1091-sparse-checkout-builtin.sh |  7 +++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 542d57fac6..9b313093cd 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -308,6 +308,8 @@ static int write_patterns_and_update(struct pattern_list *pl)
 {
 	char *sparse_filename;
 	FILE *fp;
+	int fd;
+	struct lock_file lk = LOCK_INIT;
 	int result;
 	
 	if (!core_apply_sparse_checkout) {
@@ -317,21 +319,28 @@ static int write_patterns_and_update(struct pattern_list *pl)
 
 	result = update_working_directory(pl);
 
+	sparse_filename = get_sparse_checkout_filename();
+	fd = hold_lock_file_for_update(&lk, sparse_filename,
+				      LOCK_DIE_ON_ERROR);
+
+	result = update_working_directory(pl);
 	if (result) {
+		rollback_lock_file(&lk);
+		free(sparse_filename);
 		clear_pattern_list(pl);
 		update_working_directory(NULL);
 		return result;
 	}
 
-	sparse_filename = get_sparse_checkout_filename();
-	fp = fopen(sparse_filename, "w");
+	fp = fdopen(fd, "w");
 
 	if (core_sparse_checkout_cone)
 		write_cone_to_file(fp, pl);
 	else
 		write_patterns_to_file(fp, pl);
 
-	fclose(fp);
+	fflush(fp);
+	commit_lock_file(&lk);
 
 	free(sparse_filename);
 	clear_pattern_list(pl);
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 82eb5fb2f8..f22a4afbea 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -262,4 +262,11 @@ test_expect_success 'revert to old sparse-checkout on bad update' '
 	test_cmp dir expect
 '
 
+test_expect_success 'fail when lock is taken' '
+	test_when_finished rm -rf repo/.git/info/sparse-checkout.lock &&
+	touch repo/.git/info/sparse-checkout.lock &&
+	test_must_fail git -C repo sparse-checkout set deep 2>err &&
+	test_i18ngrep "File exists" err
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v3 17/17] sparse-checkout: cone mode should not interact with .gitignore
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (15 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 16/17] sparse-checkout: write using lockfile Derrick Stolee via GitGitGadget
@ 2019-10-07 20:08     ` Derrick Stolee via GitGitGadget
  2019-10-12 23:00       ` Elijah Newren
  2019-10-12 23:22     ` [PATCH v3 00/17] New sparse-checkout builtin and "cone" mode Elijah Newren
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
  18 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-07 20:08 UTC (permalink / raw)
  To: git; +Cc: newren, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

During the development of the sparse-checkout "cone mode" feature,
an incorrect placement of the initializer for "use_cone_patterns = 1"
caused warnings to show up when a .gitignore file was present with
non-cone-mode patterns. This was fixed in the original commit
introducing the cone mode, but now we should add a test to avoid
hitting this problem again in the future.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 t/t1091-sparse-checkout-builtin.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index f22a4afbea..ed9355384a 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -269,4 +269,11 @@ test_expect_success 'fail when lock is taken' '
 	test_i18ngrep "File exists" err
 '
 
+test_expect_success '.gitignore should not warn about cone mode' '
+	git -C repo config --worktree core.sparseCheckoutCone true &&
+	echo "**/bin/*" >repo/.gitignore &&
+	git -C repo reset --hard 2>err &&
+	test_i18ngrep ! "disabling cone patterns" err
+'
+
 test_done
-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 01/17] sparse-checkout: create builtin with 'list' subcommand
  2019-10-07 20:08     ` [PATCH v3 01/17] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-11 22:01       ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-11 22:01 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
> +SPARSE CHECKOUT
> +----------------
> +
> +"Sparse checkout" allows populating the working directory sparsely.
> +It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
> +Git whether a file in the working directory is worth looking at. If
> +the skip-worktree bit is set, then the file is ignored in the working
> +directory. Git will not populate the contents of those files, which
> +makes a sparse checkout helpful when working in a repository with many
> +files, but only a few are important to the current user.
> +
> +The `$GIT_DIR/info/sparse-checkout` file is used to define the
> +skip-worktree reference bitmap. When Git updates the working
> +directory, it updates the skip-worktree bits in the index based
> +ont this file. The files matching the patterns in the file will

s/ont/on/

> +appear in the working directory, and the rest will not.
> +
> +## FULL PATTERN SET
> +
> +By default, the sparse-checkout file uses the same syntax as `.gitignore`
> +files.
> +
> +While `$GIT_DIR/info/sparse-checkout` is usually used to specify what
> +files are included, you can also specify what files are _not_ included,
> +using negative patterns. For example, to remove the file `unwanted`:
> +
> +----------------
> +/*
> +!unwanted
> +----------------
> +
> +Another tricky thing is fully repopulating the working directory when you
> +no longer want sparse checkout. You cannot just disable "sparse
> +checkout" because skip-worktree bits are still in the index and your working
> +directory is still sparsely populated. You should re-populate the working
> +directory with the `$GIT_DIR/info/sparse-checkout` file content as
> +follows:
> +
> +----------------
> +/*
> +----------------
> +
> +Then you can disable sparse checkout. Sparse checkout support in 'git
> +checkout' and similar commands is disabled by default. You need to
> +set `core.sparseCheckout` to `true` in order to have sparse checkout
> +support.

Looks like these disappear by the end of the series, so no need to
comment on them.  Thanks for all the fixes; other than the trivial
typo above, this patch looks good.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 02/17] sparse-checkout: create 'init' subcommand
  2019-10-07 20:08     ` [PATCH v3 02/17] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-11 22:14       ` Elijah Newren
  2019-10-14 20:22         ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-11 22:14 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
> ++
> +The init subcommand also enables the 'extensions.worktreeConfig' setting
> +and sets the `core.sparseCheckout` setting in the worktree-specific config
> +file. This prevents the sparse-checkout feature from interfering with other
> +worktrees.

I'm afraid that might be mis-parsed by future readers.  Perhaps something like:

The init subcommand also enables the `core.sparseCheckout` setting.
To avoid interfering with other worktrees, it first enables the
`extensions.worktreeConfig` setting and makes sure to set the
`core.sparseCheckout` setting in the worktree-specific config file.

> +enum sparse_checkout_mode {
> +       MODE_NONE = 0,
> +       MODE_FULL = 1,
> +};

So MODE_FULL is "true" and MODE_NONE is "false".  MODE_NONE seems
confusing to me, but let's keep reading...

> +
> +static int sc_set_config(enum sparse_checkout_mode mode)
> +{
> +       struct argv_array argv = ARGV_ARRAY_INIT;
> +
> +       if (git_config_set_gently("extensions.worktreeConfig", "true")) {
> +               error(_("failed to set extensions.worktreeConfig setting"));
> +               return 1;
> +       }
> +
> +       argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
> +
> +       if (mode)
> +               argv_array_pushl(&argv, "true", NULL);
> +       else
> +               argv_array_pushl(&argv, "false", NULL);

Wait, what?  MODE_FULL is used to specify that you want a sparse
checkout, and MODE_NONE is used to denote that you want a full (i.e.
non-sparse) checkout?  These are *very* confusing names.


> +static int sparse_checkout_init(int argc, const char **argv)
> +{
> +       struct pattern_list pl;
> +       char *sparse_filename;
> +       FILE *fp;
> +       int res;
> +
> +       if (sc_set_config(MODE_FULL))
> +               return 1;

Seems confusing here too.


Everything else in the patch looks good, though.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 03/17] clone: add --sparse mode
  2019-10-07 20:08     ` [PATCH v3 03/17] clone: add --sparse mode Derrick Stolee via GitGitGadget
@ 2019-10-11 22:20       ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-11 22:20 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
> During the 'git sparse-checkout init' call, we must first look
> to see if HEAD is valid, since 'git clone' does not have a valid
> HEAD.

...does not have a valid HEAD by the time git_sparse_checkout_init() is called?

> The first checkout will create the HEAD ref and update the
> working directory correctly.

Is this checkout you reference a manually initiated user checkout after
the clone, or the checkout performed as part of the clone?  (I'm
almost certain it's the latter, but your wording makes me question it.)

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v2 04/11] sparse-checkout: 'set' subcommand
  2019-10-07 18:26         ` Derrick Stolee
@ 2019-10-11 22:24           ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-11 22:24 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Derrick Stolee via GitGitGadget, Git Mailing List,
	Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 11:26 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 10/5/2019 8:30 PM, Elijah Newren wrote:
> > On Sat, Oct 5, 2019 at 3:44 PM Elijah Newren <newren@gmail.com> wrote:
> >>
> >> On Thu, Sep 19, 2019 at 3:07 PM Derrick Stolee via GitGitGadget
> >> <gitgitgadget@gmail.com> wrote:
> >>> +static int write_patterns_and_update(struct pattern_list *pl)
> >>> +{
> >>> +       char *sparse_filename;
> >>> +       FILE *fp;
> >>> +
> >>> +       sparse_filename = get_sparse_checkout_filename();
> >>> +       fp = fopen(sparse_filename, "w");
> >>> +       write_patterns_to_file(fp, pl);
> >>> +       fclose(fp);
> >>> +       free(sparse_filename);
> >>> +
> >>> +       clear_pattern_list(pl);
> >>
> >> It seems slightly odd that pl is passed in but cleared in this
> >> function rather than in the caller that created pl.  Should this be
> >> moved to the caller, or, alternatively, a comment added to explain
> >> this side-effect for future callers of the function?
> >>
> >> The rest of the patch looked good to me.
> >
> > Actually, thought of something else.  What if the user calls 'git
> > sparse-checkout set ...' without first calling 'git sparse-checkout
> > init'?  Should that report an error to the user, a suggestion to
> > follow it up with 'sparse-checkout init', or should it just call
> > sc_set_config() behind the scenes and allow bypassing the init
> > subcommand?
>
> Maybe a warning would suffice. I still think the workflow of the
> following is most correct, and not difficult to recommend:
>
> * "git sparse-checkout init [--cone]" -OR- "git clone --sparse"
> * git sparse-checkout set [stuff]
> * git sparse-checkout disable

Recommending the right thing is easy, but users will call things out
of order despite documentation.  If they call disable before init, I
see no problems that will lead to confusion.  If they call set without
calling init, I can see them being surprised...so I commented on it
and asked if we want a warning or whatever.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 04/17] sparse-checkout: 'set' subcommand
  2019-10-07 20:08     ` [PATCH v3 04/17] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-11 22:26       ` Elijah Newren
  2019-10-11 22:30         ` Elijah Newren
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-11 22:26 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The 'git sparse-checkout set' subcommand takes a list of patterns
> as arguments and writes them to the sparse-checkout file. Then, it
> updates the working directory using 'git read-tree -mu HEAD'.
>
> The 'set' subcommand will replace the entire contents of the
> sparse-checkout file. The write_patterns_and_update() method is
> extracted from cmd_sparse_checkout() to make it easier to implement
> 'add' and/or 'remove' subcommands in the future.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-sparse-checkout.txt |  5 ++++
>  builtin/sparse-checkout.c             | 35 ++++++++++++++++++++++++++-
>  t/t1091-sparse-checkout-builtin.sh    | 19 +++++++++++++++
>  3 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
> index e095c4a98b..f4bd951550 100644
> --- a/Documentation/git-sparse-checkout.txt
> +++ b/Documentation/git-sparse-checkout.txt
> @@ -39,6 +39,11 @@ and sets the `core.sparseCheckout` setting in the worktree-specific config
>  file. This prevents the sparse-checkout feature from interfering with other
>  worktrees.
>
> +'set'::
> +       Write a set of patterns to the sparse-checkout file, as given as
> +       a list of arguments following the 'set' subcommand. Update the
> +       working directory to match the new patterns.
> +
>  SPARSE CHECKOUT
>  ----------------
>
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 3ecb7ac2e7..52d4f832f3 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -8,7 +8,7 @@
>  #include "strbuf.h"
>
>  static char const * const builtin_sparse_checkout_usage[] = {
> -       N_("git sparse-checkout [init|list]"),
> +       N_("git sparse-checkout [init|list|set] <options>"),
>         NULL
>  };
>
> @@ -140,6 +140,37 @@ static int sparse_checkout_init(int argc, const char **argv)
>         return update_working_directory();
>  }
>
> +static int write_patterns_and_update(struct pattern_list *pl)
> +{
> +       char *sparse_filename;
> +       FILE *fp;
> +
> +       sparse_filename = get_sparse_checkout_filename();
> +       fp = fopen(sparse_filename, "w");
> +       write_patterns_to_file(fp, pl);
> +       fclose(fp);
> +       free(sparse_filename);
> +
> +       return update_working_directory();
> +}
> +
> +static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
> +{
> +       static const char *empty_base = "";
> +       int i;
> +       struct pattern_list pl;
> +       int result;
> +       memset(&pl, 0, sizeof(pl));
> +
> +       for (i = 1; i < argc; i++)
> +               add_pattern(argv[i], empty_base, 0, &pl, 0);
> +
> +       result = write_patterns_and_update(&pl);
> +
> +       clear_pattern_list(&pl);
> +       return result;
> +}
> +
>  int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>  {
>         static struct option builtin_sparse_checkout_options[] = {
> @@ -162,6 +193,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
>                         return sparse_checkout_list(argc, argv);
>                 if (!strcmp(argv[0], "init"))
>                         return sparse_checkout_init(argc, argv);
> +               if (!strcmp(argv[0], "set"))
> +                       return sparse_checkout_set(argc, argv, prefix);
>         }
>
>         usage_with_options(builtin_sparse_checkout_usage,
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index d4c145a3af..19e8673c6b 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -101,4 +101,23 @@ test_expect_success 'clone --sparse' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'set sparse-checkout using builtin' '
> +       git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
> +       cat >expect <<-EOF &&
> +               /*
> +               !/*/
> +               *folder*
> +       EOF
> +       git -C repo sparse-checkout list >actual &&
> +       test_cmp expect actual &&
> +       test_cmp expect repo/.git/info/sparse-checkout &&
> +       ls repo >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               folder1
> +               folder2
> +       EOF
> +       test_cmp expect dir
> +'
> +
>  test_done
> --

Looks good, thanks for the fixes.  I'm still slightly worried about
folks not looking at the docs and calling sparse-checkout set without
calling init, and then being negatively surprised.  It's a minor
issue, but a warning might be helpful.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 05/17] sparse-checkout: add '--stdin' option to set subcommand
  2019-10-07 20:08     ` [PATCH v3 05/17] sparse-checkout: add '--stdin' option to set subcommand Derrick Stolee via GitGitGadget
@ 2019-10-11 22:27       ` Elijah Newren
  2019-10-14 20:28         ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-11 22:27 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The 'git sparse-checkout set' subcommand takes a list of patterns
> and places them in the sparse-checkout file. Then, it updates the
> working directory to match those patterns. For a large list of
> patterns, the command-line call can get very cumbersome.
>
> Add a '--stdin' option to instead read patterns over standard in.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  builtin/sparse-checkout.c          | 40 ++++++++++++++++++++++++++++--
>  t/t1091-sparse-checkout-builtin.sh | 27 ++++++++++++++++++++
>  2 files changed, 65 insertions(+), 2 deletions(-)
>
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 52d4f832f3..68f3d8433e 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -145,6 +145,11 @@ static int write_patterns_and_update(struct pattern_list *pl)
>         char *sparse_filename;
>         FILE *fp;
>
> +       if (!core_apply_sparse_checkout) {
> +               warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
> +               warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
> +       }
> +
>         sparse_filename = get_sparse_checkout_filename();
>         fp = fopen(sparse_filename, "w");
>         write_patterns_to_file(fp, pl);
> @@ -154,16 +159,47 @@ static int write_patterns_and_update(struct pattern_list *pl)
>         return update_working_directory();
>  }
>
> +static char const * const builtin_sparse_checkout_set_usage[] = {
> +       N_("git sparse-checkout set [--stdin|<patterns>]"),
> +       NULL
> +};
> +
> +static struct sparse_checkout_set_opts {
> +       int use_stdin;
> +} set_opts;
> +
>  static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
>  {
>         static const char *empty_base = "";
>         int i;
>         struct pattern_list pl;
>         int result;
> +
> +       static struct option builtin_sparse_checkout_set_options[] = {
> +               OPT_BOOL(0, "stdin", &set_opts.use_stdin,
> +                        N_("read patterns from standard in")),
> +               OPT_END(),
> +       };
> +
>         memset(&pl, 0, sizeof(pl));
>
> -       for (i = 1; i < argc; i++)
> -               add_pattern(argv[i], empty_base, 0, &pl, 0);
> +       argc = parse_options(argc, argv, prefix,
> +                            builtin_sparse_checkout_set_options,
> +                            builtin_sparse_checkout_set_usage,
> +                            PARSE_OPT_KEEP_UNKNOWN);

Does this mean users can also spell it 'git sparse-checkout --stdin
set', instead of the expected 'git sparse-checkout set --stdin'?

> +
> +       if (set_opts.use_stdin) {
> +               struct strbuf line = STRBUF_INIT;
> +
> +               while (!strbuf_getline(&line, stdin)) {
> +                       size_t len;
> +                       char *buf = strbuf_detach(&line, &len);
> +                       add_pattern(buf, empty_base, 0, &pl, 0);
> +               }
> +       } else {
> +               for (i = 0; i < argc; i++)
> +                       add_pattern(argv[i], empty_base, 0, &pl, 0);
> +       }
>
>         result = write_patterns_and_update(&pl);
>
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 19e8673c6b..2a0137fde3 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -101,6 +101,13 @@ test_expect_success 'clone --sparse' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'warn if core.sparseCheckout is disabled' '
> +       test_when_finished git -C repo config --worktree core.sparseCheckout true &&
> +       git -C repo config --worktree core.sparseCheckout false &&
> +       git -C repo sparse-checkout set folder1 2>err &&
> +       test_i18ngrep "core.sparseCheckout is disabled" err
> +'
> +
>  test_expect_success 'set sparse-checkout using builtin' '
>         git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
>         cat >expect <<-EOF &&
> @@ -120,4 +127,24 @@ test_expect_success 'set sparse-checkout using builtin' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'set sparse-checkout using --stdin' '
> +       cat >expect <<-EOF &&
> +               /*
> +               !/*/
> +               /folder1/
> +               /folder2/
> +       EOF
> +       git -C repo sparse-checkout set --stdin <expect &&
> +       git -C repo sparse-checkout list >actual &&
> +       test_cmp expect actual &&
> +       test_cmp expect repo/.git/info/sparse-checkout &&
> +       ls repo >dir  &&
> +       cat >expect <<-EOF &&
> +               a
> +               folder1
> +               folder2
> +       EOF
> +       test_cmp expect dir
> +'
> +
>  test_done
> --
> gitgitgadget
>

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 04/17] sparse-checkout: 'set' subcommand
  2019-10-11 22:26       ` Elijah Newren
@ 2019-10-11 22:30         ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-11 22:30 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Fri, Oct 11, 2019 at 3:26 PM Elijah Newren <newren@gmail.com> wrote:
>
> On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget

> Looks good, thanks for the fixes.  I'm still slightly worried about
> folks not looking at the docs and calling sparse-checkout set without
> calling init, and then being negatively surprised.  It's a minor
> issue, but a warning might be helpful.

Looks like you added that to patch 5, so never mind.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 13/17] read-tree: show progress by default
  2019-10-07 20:08     ` [PATCH v3 13/17] read-tree: show progress by default Derrick Stolee via GitGitGadget
@ 2019-10-12 22:16       ` Elijah Newren
  2019-10-14 20:31         ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-12 22:16 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The read-tree builtin has a --verbose option that signals to show
> progress and other data while updating the index. Update this to
> be on by default when stderr is a terminal window.
>
> This will help tools like 'git sparse-checkout' to automatically
> benefit from progress indicators when a user runs these commands.

This change seems fine, but in patch 2 you said:

> The use of running another process for 'git read-tree' is sub-
> optimal. This will be removed in a later change.

leaving me slightly confused about the goal/plan.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 15/17] sparse-checkout: update working directory in-process
  2019-10-07 20:08     ` [PATCH v3 15/17] sparse-checkout: update working directory in-process Derrick Stolee via GitGitGadget
@ 2019-10-12 22:57       ` Elijah Newren
  2019-10-14 20:39         ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-12 22:57 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The sparse-checkout builtin used 'git read-tree -mu HEAD' to update the
> skip-worktree bits in the index and to update the working directory.
> This extra process is overly complex, and prone to failure. It also
> requires that we write our changes to the sparse-checkout file before
> trying to update the index.
>
> Remove this extra process call by creating a direct call to
> unpack_trees() in the same way 'git read-tree -mu HEAD' does. In
> adition, provide an in-memory list of patterns so we can avoid

s/adition/addition/

> reading from the sparse-checkout file. This allows us to test a
> proposed change to the file before writing to it.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  builtin/read-tree.c                |  2 +-
>  builtin/sparse-checkout.c          | 85 +++++++++++++++++++++++++-----
>  t/t1091-sparse-checkout-builtin.sh | 17 ++++++
>  unpack-trees.c                     |  5 +-
>  unpack-trees.h                     |  3 +-
>  5 files changed, 95 insertions(+), 17 deletions(-)
>
> diff --git a/builtin/read-tree.c b/builtin/read-tree.c
> index 69963d83dc..d7eeaa26ec 100644
> --- a/builtin/read-tree.c
> +++ b/builtin/read-tree.c
> @@ -186,7 +186,7 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
>
>         if (opts.reset || opts.merge || opts.prefix) {
>                 if (read_cache_unmerged() && (opts.prefix || opts.merge))
> -                       die("You need to resolve your current index first");
> +                       die(_("You need to resolve your current index first"));

A good change, but isn't this unrelated to the current commit?

>                 stage = opts.merge = 1;
>         }
>         resolve_undo_clear();
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 25786f8bb0..542d57fac6 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -7,6 +7,11 @@
>  #include "run-command.h"
>  #include "strbuf.h"
>  #include "string-list.h"
> +#include "cache.h"
> +#include "cache-tree.h"
> +#include "lockfile.h"
> +#include "resolve-undo.h"
> +#include "unpack-trees.h"
>
>  static char const * const builtin_sparse_checkout_usage[] = {
>         N_("git sparse-checkout [init|list|set|disable] <options>"),
> @@ -60,18 +65,53 @@ static int sparse_checkout_list(int argc, const char **argv)
>         return 0;
>  }
>
> -static int update_working_directory(void)
> +static int update_working_directory(struct pattern_list *pl)
>  {
> -       struct argv_array argv = ARGV_ARRAY_INIT;
>         int result = 0;
> -       argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
> +       struct unpack_trees_options o;
> +       struct lock_file lock_file = LOCK_INIT;
> +       struct object_id oid;
> +       struct tree *tree;
> +       struct tree_desc t;
> +       struct repository *r = the_repository;
>
> -       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
> -               error(_("failed to update index with new sparse-checkout paths"));
> -               result = 1;
> +       if (repo_read_index_unmerged(r))
> +               die(_("You need to resolve your current index first"));

Well, at least that ensures that the user gets a good error message.
I'm not sure I like the error, because e.g. if a user hits a conflict
while merging in a sparse checkout and wants to return to a non-sparse
checkout because they think other files might help them resolve the
conflicts, then they ought to be able to do it.  Basically, unless
they are trying to use sparsification to remove entries from the working
directory that differ from the index (and conflicted entries always
differ), then it seems like we should be able to support
sparsification despite the presence of conflicts.

Your series is long enough, doesn't make this problem any worse (and
appears to make it slightly better), and so you really don't need to
tackle that problem in this series. I'm just stating a gripe with
sparse checkouts again.  :-)

[...]

>  static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
>  {
> -       struct pattern_entry *e = xmalloc(sizeof(struct pattern_entry));
> +       struct pattern_entry *e = xmalloc(sizeof(*e));

This is a good fix, but shouldn't it be squashed into the
"sparse-checkout: init and set in cone mode" commit from earlier in
your series?

> @@ -262,12 +308,21 @@ static int write_patterns_and_update(struct pattern_list *pl)
>  {
>         char *sparse_filename;
>         FILE *fp;
> -
> +       int result;
> +

Trailing whitespace that should be cleaned up.

>         if (!core_apply_sparse_checkout) {
>                 warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
>                 warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
>         }
>
> +       result = update_working_directory(pl);
> +
> +       if (result) {
> +               clear_pattern_list(pl);
> +               update_working_directory(NULL);
> +               return result;
> +       }
> +
>         sparse_filename = get_sparse_checkout_filename();
>         fp = fopen(sparse_filename, "w");
>
> @@ -277,9 +332,11 @@ static int write_patterns_and_update(struct pattern_list *pl)
>                 write_patterns_to_file(fp, pl);
>
>         fclose(fp);
> +
>         free(sparse_filename);
> +       clear_pattern_list(pl);
>
> -       return update_working_directory();
> +       return 0;
>  }
>
>  static void strbuf_to_cone_pattern(struct strbuf *line, struct pattern_list *pl)
> @@ -330,6 +387,7 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
>                 struct strbuf line = STRBUF_INIT;
>                 hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0);
>                 hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0);
> +               pl.use_cone_patterns = 1;
>
>                 if (set_opts.use_stdin) {
>                         while (!strbuf_getline(&line, stdin))
> @@ -375,7 +433,8 @@ static int sparse_checkout_disable(int argc, const char **argv)
>         fprintf(fp, "/*\n");
>         fclose(fp);
>
> -       if (update_working_directory())
> +       core_apply_sparse_checkout = 1;
> +       if (update_working_directory(NULL))
>                 die(_("error while refreshing working directory"));
>
>         unlink(sparse_filename);
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index ee4d361787..82eb5fb2f8 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -199,11 +199,13 @@ test_expect_success 'cone mode: init and set' '
>                 a
>                 deep
>         EOF
> +       test_cmp dir expect &&
>         ls repo/deep >dir  &&
>         cat >expect <<-EOF &&
>                 a
>                 deeper1
>         EOF
> +       test_cmp dir expect &&
>         ls repo/deep/deeper1 >dir  &&
>         cat >expect <<-EOF &&
>                 a
> @@ -245,4 +247,19 @@ test_expect_success 'cone mode: set with nested folders' '
>         test_cmp repo/.git/info/sparse-checkout expect
>  '
>
> +test_expect_success 'revert to old sparse-checkout on bad update' '
> +       echo update >repo/deep/deeper2/a &&
> +       cp repo/.git/info/sparse-checkout expect &&
> +       test_must_fail git -C repo sparse-checkout set deep/deeper1 2>err &&
> +       test_i18ngrep "Cannot update sparse checkout" err &&
> +       test_cmp repo/.git/info/sparse-checkout expect &&
> +       ls repo/deep >dir &&
> +       cat >expect <<-EOF &&
> +               a
> +               deeper1
> +               deeper2
> +       EOF
> +       test_cmp dir expect
> +'
> +
>  test_done
> diff --git a/unpack-trees.c b/unpack-trees.c
> index edf0fb4673..f0fee5adf2 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -1508,7 +1508,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>         memset(&pl, 0, sizeof(pl));
>         if (!core_apply_sparse_checkout || !o->update)
>                 o->skip_sparse_checkout = 1;
> -       if (!o->skip_sparse_checkout) {
> +       if (!o->skip_sparse_checkout && !o->pl) {
>                 char *sparse = git_pathdup("info/sparse-checkout");
>                 pl.use_cone_patterns = core_sparse_checkout_cone;
>                 if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
> @@ -1681,7 +1681,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>
>  done:
>         trace_performance_leave("unpack_trees");
> -       clear_pattern_list(&pl);
> +       if (!o->keep_pattern_list)
> +               clear_pattern_list(&pl);
>         return ret;
>
>  return_failed:
> diff --git a/unpack-trees.h b/unpack-trees.h
> index f2eee0c7c5..ca94a421a5 100644
> --- a/unpack-trees.h
> +++ b/unpack-trees.h
> @@ -59,7 +59,8 @@ struct unpack_trees_options {
>                      quiet,
>                      exiting_early,
>                      show_all_errors,
> -                    dry_run;
> +                    dry_run,
> +                    keep_pattern_list;
>         const char *prefix;
>         int cache_bottom;
>         struct dir_struct *dir;
> --

The rest looks reasonable.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 16/17] sparse-checkout: write using lockfile
  2019-10-07 20:08     ` [PATCH v3 16/17] sparse-checkout: write using lockfile Derrick Stolee via GitGitGadget
@ 2019-10-12 22:59       ` Elijah Newren
  2019-10-14 20:41         ` Derrick Stolee
  0 siblings, 1 reply; 196+ messages in thread
From: Elijah Newren @ 2019-10-12 22:59 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> If two 'git sparse-checkout set' subcommands are launched at the
> same time, the behavior can be unexpected as they compete to write
> the sparse-checkout file and update the working directory.
>
> Take a lockfile around the writes to the sparse-checkout file. In
> addition, acquire this lock around the working directory update
> to avoid two commands updating the working directory in different
> ways.

Wow, there's something I never would have thought to check.  Did you
have folks run into this, or is this just some defensive programming?
Either way, I'm impressed.

>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  builtin/sparse-checkout.c          | 15 ++++++++++++---
>  t/t1091-sparse-checkout-builtin.sh |  7 +++++++
>  2 files changed, 19 insertions(+), 3 deletions(-)
>
> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
> index 542d57fac6..9b313093cd 100644
> --- a/builtin/sparse-checkout.c
> +++ b/builtin/sparse-checkout.c
> @@ -308,6 +308,8 @@ static int write_patterns_and_update(struct pattern_list *pl)
>  {
>         char *sparse_filename;
>         FILE *fp;
> +       int fd;
> +       struct lock_file lk = LOCK_INIT;
>         int result;
>
>         if (!core_apply_sparse_checkout) {
> @@ -317,21 +319,28 @@ static int write_patterns_and_update(struct pattern_list *pl)
>
>         result = update_working_directory(pl);
>
> +       sparse_filename = get_sparse_checkout_filename();
> +       fd = hold_lock_file_for_update(&lk, sparse_filename,
> +                                     LOCK_DIE_ON_ERROR);
> +
> +       result = update_working_directory(pl);
>         if (result) {
> +               rollback_lock_file(&lk);
> +               free(sparse_filename);
>                 clear_pattern_list(pl);
>                 update_working_directory(NULL);
>                 return result;
>         }
>
> -       sparse_filename = get_sparse_checkout_filename();
> -       fp = fopen(sparse_filename, "w");
> +       fp = fdopen(fd, "w");
>
>         if (core_sparse_checkout_cone)
>                 write_cone_to_file(fp, pl);
>         else
>                 write_patterns_to_file(fp, pl);
>
> -       fclose(fp);
> +       fflush(fp);
> +       commit_lock_file(&lk);
>
>         free(sparse_filename);
>         clear_pattern_list(pl);
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 82eb5fb2f8..f22a4afbea 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -262,4 +262,11 @@ test_expect_success 'revert to old sparse-checkout on bad update' '
>         test_cmp dir expect
>  '
>
> +test_expect_success 'fail when lock is taken' '
> +       test_when_finished rm -rf repo/.git/info/sparse-checkout.lock &&
> +       touch repo/.git/info/sparse-checkout.lock &&
> +       test_must_fail git -C repo sparse-checkout set deep 2>err &&
> +       test_i18ngrep "File exists" err
> +'
> +
>  test_done
> --
> gitgitgadget
>

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 17/17] sparse-checkout: cone mode should not interact with .gitignore
  2019-10-07 20:08     ` [PATCH v3 17/17] sparse-checkout: cone mode should not interact with .gitignore Derrick Stolee via GitGitGadget
@ 2019-10-12 23:00       ` Elijah Newren
  0 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-12 23:00 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> During the development of the sparse-checkout "cone mode" feature,
> an incorrect placement of the initializer for "use_cone_patterns = 1"
> caused warnings to show up when a .gitignore file was present with
> non-cone-mode patterns. This was fixed in the original commit
> introducing the cone mode, but now we should add a test to avoid
> hitting this problem again in the future.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  t/t1091-sparse-checkout-builtin.sh | 7 +++++++
>  1 file changed, 7 insertions(+)
>
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index f22a4afbea..ed9355384a 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -269,4 +269,11 @@ test_expect_success 'fail when lock is taken' '
>         test_i18ngrep "File exists" err
>  '
>
> +test_expect_success '.gitignore should not warn about cone mode' '
> +       git -C repo config --worktree core.sparseCheckoutCone true &&
> +       echo "**/bin/*" >repo/.gitignore &&
> +       git -C repo reset --hard 2>err &&
> +       test_i18ngrep ! "disabling cone patterns" err
> +'
> +
>  test_done
> --

Makes sense; thanks for adding good preventative tests.

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 00/17] New sparse-checkout builtin and "cone" mode
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (16 preceding siblings ...)
  2019-10-07 20:08     ` [PATCH v3 17/17] sparse-checkout: cone mode should not interact with .gitignore Derrick Stolee via GitGitGadget
@ 2019-10-12 23:22     ` Elijah Newren
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
  18 siblings, 0 replies; 196+ messages in thread
From: Elijah Newren @ 2019-10-12 23:22 UTC (permalink / raw)
  To: Derrick Stolee via GitGitGadget; +Cc: Git Mailing List, Junio C Hamano

On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> This series makes the sparse-checkout feature more user-friendly. While
> there, I also present a way to use a limited set of patterns to gain a
> significant performance boost in very large repositories.
>
> Sparse-checkout is only documented as a subsection of the read-tree docs
> [1], which makes the feature hard to discover. Users have trouble navigating
> the feature, especially at clone time [2], and have even resorted to
> creating their own helper tools [3].
>
> This series attempts to solve these problems using a new builtin. Here is a
> sample workflow to give a feeling for how it can work:
>
> In an existing repo:
>
> $ git sparse-checkout init
> $ ls
> myFile1.txt myFile2.txt
> $ git sparse-checkout set "/*" "!/*/" /myFolder/
> $ ls
> myFile1.txt myFile2.txt myFolder
> $ ls myFolder
> a.c a.h
> $ git sparse-checkout disable
> $ ls
> hiddenFolder myFile1.txt myFile2.txt myFolder
>
> At clone time:
>
> $ git clone --sparse origin repo
> $ cd repo
> $ ls
> myFile1.txt myFile2.txt
> $ git sparse-checkout set "/*" "!/*/" /myFolder/
> $ ls
> myFile1.txt myFile2.txt myFolder
>
> Here are some more specific details:
>
>  * git sparse-checkout init enables core.sparseCheckout and populates the
>    sparse-checkout file with patterns that match only the files at root.
>
>
>  * git clone learns the --sparse argument to run git sparse-checkout init
>    before the first checkout.
>
>
>  * git sparse-checkout set reads patterns from the arguments, or with
>    --stdin reads patterns from stdin one per line, then writes them to the
>    sparse-checkout file and refreshes the working directory.
>
>
>  * git sparse-checkout disable removes the patterns from the sparse-checkout
>    file, disables core.sparseCheckout, and refills the working directory.
>
>
>  * git sparse-checkout list lists the contents of the sparse-checkout file.
>
>
>
> The documentation for the sparse-checkout feature can now live primarily
> with the git-sparse-checkout documentation.
>
> Cone Mode
> =========
>
> What really got me interested in this area is a performance problem. If we
> have N patterns in the sparse-checkout file and M entries in the index, then
> we can perform up to O(N * M) pattern checks in clear_ce_flags(). This
> quadratic growth is not sustainable in a repo with 1,000+ patterns and
> 1,000,000+ index entries.
>
> To solve this problem, I propose a new, more restrictive mode to
> sparse-checkout: "cone mode". In this mode, all patterns are based on prefix
> matches at a directory level. This can then use hashsets for fast
> performance -- O(M) instead of O(N*M). My hashset implementation is based on
> the virtual filesystem hook in the VFS for Git custom code [4].
>
> In cone mode, a user specifies a list of folders which the user wants every
> file inside. In addition, the cone adds all blobs that are siblings of the
> folders in the directory path to that folder. This makes the directories
> look "hydrated" as a user drills down to those recursively-closed folders.
> These directories are called "parent" folders, as a file matches them only
> if the file's immediate parent is that directory.
>
> When building a prototype of this feature, I used a separate file to contain
> the list of recursively-closed folders and built the hashsets dynamically
> based on that file. In this implementation, I tried to maximize the amount
> of backwards-compatibility by storing all data in the sparse-checkout file
> using patterns recognized by earlier Git versions.
>
> For example, if we add A/B/C as a recursive folder, then we add the
> following patterns to the sparse-checkout file:
>
> /*
> !/*/
> /A/
> !/A/*/
> /A/B/
> !/A/B/*/
> /A/B/C/
>
> The alternating positive/negative patterns say "include everything in this
> folder, but exclude everything another level deeper". The final pattern has
> no matching negation, so is a recursively closed pattern.
>
> Note that I have some basic warnings to try and check that the
> sparse-checkout file doesn't match what would be written by a cone-mode add.
> In such a case, Git writes a warning to stderr and continues with the old
> pattern matching algorithm. These checks are currently very barebones, and
> would need to be updated with more robust checks for things like regex
> characters in the middle of the pattern. As review moves forward (and if we
> don't change the data storage) then we could spend more time on this.
>
> Thanks, -Stolee
>
> Updates in v2, relative to the RFC:
>
>  * Instead of an 'add' subcommand, use a 'set' subcommand. We can consider
>    adding 'add' and/or 'remove' subcommands later.
>
>
>  * 'set' reads from the arguments by default. '--stdin' option is available.
>
>
>  * A new performance-oriented commit is added at the end.
>
>
>  * Patterns no longer end with a trailing asterisk except for the first "/*"
>    pattern.
>
>
>  * References to a "bug" (that was really a strange GVFS interaction in
>    microsoft/git) around deleting outside the cone are removed.
>
>
>
> Updates in v3:
>
>  * The bad interaction with "cone mode" and .gitignore files is fixed. A
>    test is added in the last patch.
>
>
>  * Several patches are added that make the feature more robust. One
>    sanitizes user input, another few add progress indicators, and another
>    more prevent users from getting in bad states due to working directory
>    changes or concurrent processes.
>
>
>  * Updated several docs and commit messages according to feedback. Thanks,
>    Elijah!
>

I've read through v3, and at this point I'm mostly just finding small
stuff to comment on; the patches are looking pretty good.  One bigger
item I'd like to comment on is that I'd really like this feature to
be explicitly labelled as experimental as per [1].  Not just because
of edge and corner case bugs (at least one of which we know about and
I've drummed on a few times), but much more importantly because I
believe grep, log, diff, and other commands should by default pay
attention to the sparsity patterns to limit their output for the user
both because that's the stuff the user is interested in and the files
outside those paths are at best noise, and because it provides them an
ability to get the responsiveness of a small repository even while
working in a bigger one.  See [2] for more details.  If we don't mark
this command as experimental, I'm worried people may start coming to
expect whatever behavior they get with sparse checkouts and then we'll
be locked in due to backward compatibility, and users who want a good
experience will have to know to set a whole bunch of flags instead of
getting the right behavior by default.

[1] https://public-inbox.org/git/b1444dab-24e5-6e4d-bea8-37abc433b546@gmail.com/
[2] https://public-inbox.org/git/CABPp-BGuFhDwWZBRaD3nA8ui46wor-4=Ha1G1oApsfF8KNpfGQ@mail.gmail.com/

> Things to leave for future patches:
>
>  1. Integrate in 'git worktree add' to copy the sparse-checkout file to a
>     worktree-specific file.

I'm a big fan of this, but it sounds like Junio isn't[2].  I tried to
follow up to get more details, but I'm worried his view might mean
non-sparse-worktrees-by-default, which in turn I'm worried will be
unworkable with sufficiently large repos and/or not be very friendly
to future users of both partial clones and sparse checkouts.

[2] https://public-inbox.org/git/xmqqy2y3ejwe.fsf@gitster-ct.c.googlers.com/

>  2. More robustness around detecting non-cone patterns with wildcards in the
>     middle of the line.

Makes sense.

>  3. 'git clone --sparse-cone' to clone into "cone mode" sparse-checkouts
>     (i.e. set 'core.sparseCheckoutCone=true'). This may not be
>     super-valuable, as it only starts changing behavior when someone calls
>     'git sparse-checkout set', but may be interesting.

It'd probably be nice for users, who would otherwise need to run:
  git clone --sparse ...
  git sparse-checkout init --cone
  git sparse-checkout set ...

The need to run init after doing a clone --sparse could possibly be
seen as annoying.  OR.... maybe those users will just set
core.sparseCheckoutCone=true in their global gitconfig, if the
documentation points them to it or they think about it.



Thanks for all the work on this series!  (And sorry my reviews always
take a while...)

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 02/17] sparse-checkout: create 'init' subcommand
  2019-10-11 22:14       ` Elijah Newren
@ 2019-10-14 20:22         ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-14 20:22 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/11/2019 6:14 PM, Elijah Newren wrote:
> On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>> ++
>> +The init subcommand also enables the 'extensions.worktreeConfig' setting
>> +and sets the `core.sparseCheckout` setting in the worktree-specific config
>> +file. This prevents the sparse-checkout feature from interfering with other
>> +worktrees.
> 
> I'm afraid that might be mis-parsed by future readers.  Perhaps something like:
> 
> The init subcommand also enables the `core.sparseCheckout` setting.

I like the paragraph below, but the sentence above is repeated from
the earlier paragraph.

> To avoid interfering with other worktrees, it first enables the
> `extensions.worktreeConfig` setting and makes sure to set the
> `core.sparseCheckout` setting in the worktree-specific config file.
> 
>> +enum sparse_checkout_mode {
>> +       MODE_NONE = 0,
>> +       MODE_FULL = 1,
>> +};
> 
> So MODE_FULL is "true" and MODE_NONE is "false".  MODE_NONE seems
> confusing to me, but let's keep reading...
> 
>> +
>> +static int sc_set_config(enum sparse_checkout_mode mode)
>> +{
>> +       struct argv_array argv = ARGV_ARRAY_INIT;
>> +
>> +       if (git_config_set_gently("extensions.worktreeConfig", "true")) {
>> +               error(_("failed to set extensions.worktreeConfig setting"));
>> +               return 1;
>> +       }
>> +
>> +       argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
>> +
>> +       if (mode)
>> +               argv_array_pushl(&argv, "true", NULL);
>> +       else
>> +               argv_array_pushl(&argv, "false", NULL);
> 
> Wait, what?  MODE_FULL is used to specify that you want a sparse
> checkout, and MODE_NONE is used to denote that you want a full (i.e.
> non-sparse) checkout?  These are *very* confusing names.

I understand they are confusing; hopefully it makes more sense with
the cone mode later.

* NONE == "No patterns at all"
* FULL == "all patterns allowed"
* CONE == "only cone patterns" (appears later)

Since this is just an internal detail, what if I switched it to

* MODE_NO_PATTERNS
* MODE_ALL_PATTERNS
* MODE_CONE_PATTERNS

Would that make more sense?

>> +static int sparse_checkout_init(int argc, const char **argv)
>> +{
>> +       struct pattern_list pl;
>> +       char *sparse_filename;
>> +       FILE *fp;
>> +       int res;
>> +
>> +       if (sc_set_config(MODE_FULL))
>> +               return 1;
> 
> Seems confusing here too.
> 
> 
> Everything else in the patch looks good, though.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 05/17] sparse-checkout: add '--stdin' option to set subcommand
  2019-10-11 22:27       ` Elijah Newren
@ 2019-10-14 20:28         ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-14 20:28 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/11/2019 6:27 PM, Elijah Newren wrote:
> On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The 'git sparse-checkout set' subcommand takes a list of patterns
>> and places them in the sparse-checkout file. Then, it updates the
>> working directory to match those patterns. For a large list of
>> patterns, the command-line call can get very cumbersome.
>>
>> Add a '--stdin' option to instead read patterns over standard in.
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  builtin/sparse-checkout.c          | 40 ++++++++++++++++++++++++++++--
>>  t/t1091-sparse-checkout-builtin.sh | 27 ++++++++++++++++++++
>>  2 files changed, 65 insertions(+), 2 deletions(-)
>>
>> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
>> index 52d4f832f3..68f3d8433e 100644
>> --- a/builtin/sparse-checkout.c
>> +++ b/builtin/sparse-checkout.c
>> @@ -145,6 +145,11 @@ static int write_patterns_and_update(struct pattern_list *pl)
>>         char *sparse_filename;
>>         FILE *fp;
>>
>> +       if (!core_apply_sparse_checkout) {
>> +               warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
>> +               warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
>> +       }
>> +
>>         sparse_filename = get_sparse_checkout_filename();
>>         fp = fopen(sparse_filename, "w");
>>         write_patterns_to_file(fp, pl);
>> @@ -154,16 +159,47 @@ static int write_patterns_and_update(struct pattern_list *pl)
>>         return update_working_directory();
>>  }
>>
>> +static char const * const builtin_sparse_checkout_set_usage[] = {
>> +       N_("git sparse-checkout set [--stdin|<patterns>]"),
>> +       NULL
>> +};
>> +
>> +static struct sparse_checkout_set_opts {
>> +       int use_stdin;
>> +} set_opts;
>> +
>>  static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
>>  {
>>         static const char *empty_base = "";
>>         int i;
>>         struct pattern_list pl;
>>         int result;
>> +
>> +       static struct option builtin_sparse_checkout_set_options[] = {
>> +               OPT_BOOL(0, "stdin", &set_opts.use_stdin,
>> +                        N_("read patterns from standard in")),
>> +               OPT_END(),
>> +       };
>> +
>>         memset(&pl, 0, sizeof(pl));
>>
>> -       for (i = 1; i < argc; i++)
>> -               add_pattern(argv[i], empty_base, 0, &pl, 0);
>> +       argc = parse_options(argc, argv, prefix,
>> +                            builtin_sparse_checkout_set_options,
>> +                            builtin_sparse_checkout_set_usage,
>> +                            PARSE_OPT_KEEP_UNKNOWN);
> 
> Does this mean users can also spell it 'git sparse-checkout --stdin
> set', instead of the expected 'git sparse-checkout set --stdin'?

No, because the parse_options() inside cmd_sparse_checkout() parses until
it doesn't recognize an option ('stdin' in your example). After we "consume"
the subcommand "set", we call this method and the parse_options() can then
read the '--stdin'.

Here is the output from my local command of 'git sparse-checkout --stdin set'
at this commit:

	$ ./git sparse-checkout --stdin set
	error: unknown option `stdin'
	usage: git sparse-checkout [init|list|set] <options>

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 13/17] read-tree: show progress by default
  2019-10-12 22:16       ` Elijah Newren
@ 2019-10-14 20:31         ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-14 20:31 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/12/2019 6:16 PM, Elijah Newren wrote:
> On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The read-tree builtin has a --verbose option that signals to show
>> progress and other data while updating the index. Update this to
>> be on by default when stderr is a terminal window.
>>
>> This will help tools like 'git sparse-checkout' to automatically
>> benefit from progress indicators when a user runs these commands.
> 
> This change seems fine, but in patch 2 you said:
> 
>> The use of running another process for 'git read-tree' is sub-
>> optimal. This will be removed in a later change.
> 
> leaving me slightly confused about the goal/plan.

True, this is not necessary for the whole series. I created this
patch as a way to show progress in our microsoft/git fork [1], then
removed the read-tree call in a later change [2]. When preparing v3,
I took all of the changes together.

I thought this was valuable on its own, for those users who are
using the old mechanisms for sparse-checkout updates.

Thanks,
-Stolee

[1] https://github.com/microsoft/git/pull/200

[2] https://github.com/microsoft/git/pull/204

^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 15/17] sparse-checkout: update working directory in-process
  2019-10-12 22:57       ` Elijah Newren
@ 2019-10-14 20:39         ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-14 20:39 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee

On 10/12/2019 6:57 PM, Elijah Newren wrote:
> On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The sparse-checkout builtin used 'git read-tree -mu HEAD' to update the
>> skip-worktree bits in the index and to update the working directory.
>> This extra process is overly complex, and prone to failure. It also
>> requires that we write our changes to the sparse-checkout file before
>> trying to update the index.
>>
>> Remove this extra process call by creating a direct call to
>> unpack_trees() in the same way 'git read-tree -mu HEAD' does. In
>> adition, provide an in-memory list of patterns so we can avoid
> 
> s/adition/addition/
> 
>> reading from the sparse-checkout file. This allows us to test a
>> proposed change to the file before writing to it.
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  builtin/read-tree.c                |  2 +-
>>  builtin/sparse-checkout.c          | 85 +++++++++++++++++++++++++-----
>>  t/t1091-sparse-checkout-builtin.sh | 17 ++++++
>>  unpack-trees.c                     |  5 +-
>>  unpack-trees.h                     |  3 +-
>>  5 files changed, 95 insertions(+), 17 deletions(-)
>>
>> diff --git a/builtin/read-tree.c b/builtin/read-tree.c
>> index 69963d83dc..d7eeaa26ec 100644
>> --- a/builtin/read-tree.c
>> +++ b/builtin/read-tree.c
>> @@ -186,7 +186,7 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
>>
>>         if (opts.reset || opts.merge || opts.prefix) {
>>                 if (read_cache_unmerged() && (opts.prefix || opts.merge))
>> -                       die("You need to resolve your current index first");
>> +                       die(_("You need to resolve your current index first"));
> 
> A good change, but isn't this unrelated to the current commit?

It's related because I'm repeating the error in the sparse-checkout builtin, but
it should be localized in both places.

>>                 stage = opts.merge = 1;
>>         }
>>         resolve_undo_clear();
>> diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
>> index 25786f8bb0..542d57fac6 100644
>> --- a/builtin/sparse-checkout.c
>> +++ b/builtin/sparse-checkout.c
>> @@ -7,6 +7,11 @@
>>  #include "run-command.h"
>>  #include "strbuf.h"
>>  #include "string-list.h"
>> +#include "cache.h"
>> +#include "cache-tree.h"
>> +#include "lockfile.h"
>> +#include "resolve-undo.h"
>> +#include "unpack-trees.h"
>>
>>  static char const * const builtin_sparse_checkout_usage[] = {
>>         N_("git sparse-checkout [init|list|set|disable] <options>"),
>> @@ -60,18 +65,53 @@ static int sparse_checkout_list(int argc, const char **argv)
>>         return 0;
>>  }
>>
>> -static int update_working_directory(void)
>> +static int update_working_directory(struct pattern_list *pl)
>>  {
>> -       struct argv_array argv = ARGV_ARRAY_INIT;
>>         int result = 0;
>> -       argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
>> +       struct unpack_trees_options o;
>> +       struct lock_file lock_file = LOCK_INIT;
>> +       struct object_id oid;
>> +       struct tree *tree;
>> +       struct tree_desc t;
>> +       struct repository *r = the_repository;
>>
>> -       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>> -               error(_("failed to update index with new sparse-checkout paths"));
>> -               result = 1;
>> +       if (repo_read_index_unmerged(r))
>> +               die(_("You need to resolve your current index first"));
> 
> Well, at least that ensures that the user gets a good error message.
> I'm not sure I like the error, because e.g. if a user hits a conflict
> while merging in a sparse checkout and wants to return to a non-sparse
> checkout because they think other files might help them resolve the
> conflicts, then they ought to be able to do it.  Basically, unless
> they are trying to use sparsification to remove entries from the working
> directory that differ from the index (and conflicted entries always
> differ), then it seems like we should be able to support
> sparsification despite the presence of conflicts.
> 
> Your series is long enough, doesn't make this problem any worse (and
> appears to make it slightly better), and so you really don't need to
> tackle that problem in this series. I'm just stating a gripe with
> sparse checkouts again.  :-)

Absolutely, we should revisit the entire feature and how it handles these
conflicts in the best possible ways. As far as I can see, the only way these
conflicts arise is if the user creates conflicting files _outside_ their
sparse cone and then expands their cone. Finding all the strange cases
will require experimentation.
 
> [...]
> 
>>  static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
>>  {
>> -       struct pattern_entry *e = xmalloc(sizeof(struct pattern_entry));
>> +       struct pattern_entry *e = xmalloc(sizeof(*e));
> 
> This is a good fix, but shouldn't it be squashed into the
> "sparse-checkout: init and set in cone mode" commit from earlier in
> your series?

Yeah, I think I mis-applied a few fixups to this commit instead of an earlier one.

>> @@ -262,12 +308,21 @@ static int write_patterns_and_update(struct pattern_list *pl)
>>  {
>>         char *sparse_filename;
>>         FILE *fp;
>> -
>> +       int result;
>> +
> 
> Trailing whitespace that should be cleaned up.

Thanks. Will do.

> 
>>         if (!core_apply_sparse_checkout) {
>>                 warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
>>                 warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
>>         }
>>
>> +       result = update_working_directory(pl);
>> +
>> +       if (result) {
>> +               clear_pattern_list(pl);
>> +               update_working_directory(NULL);
>> +               return result;
>> +       }
>> +
>>         sparse_filename = get_sparse_checkout_filename();
>>         fp = fopen(sparse_filename, "w");
>>
>> @@ -277,9 +332,11 @@ static int write_patterns_and_update(struct pattern_list *pl)
>>                 write_patterns_to_file(fp, pl);
>>
>>         fclose(fp);
>> +
>>         free(sparse_filename);
>> +       clear_pattern_list(pl);
>>
>> -       return update_working_directory();
>> +       return 0;
>>  }
>>
>>  static void strbuf_to_cone_pattern(struct strbuf *line, struct pattern_list *pl)
>> @@ -330,6 +387,7 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
>>                 struct strbuf line = STRBUF_INIT;
>>                 hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0);
>>                 hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0);
>> +               pl.use_cone_patterns = 1;
>>
>>                 if (set_opts.use_stdin) {
>>                         while (!strbuf_getline(&line, stdin))
>> @@ -375,7 +433,8 @@ static int sparse_checkout_disable(int argc, const char **argv)
>>         fprintf(fp, "/*\n");
>>         fclose(fp);
>>
>> -       if (update_working_directory())
>> +       core_apply_sparse_checkout = 1;
>> +       if (update_working_directory(NULL))
>>                 die(_("error while refreshing working directory"));
>>
>>         unlink(sparse_filename);
>> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
>> index ee4d361787..82eb5fb2f8 100755
>> --- a/t/t1091-sparse-checkout-builtin.sh
>> +++ b/t/t1091-sparse-checkout-builtin.sh
>> @@ -199,11 +199,13 @@ test_expect_success 'cone mode: init and set' '
>>                 a
>>                 deep
>>         EOF
>> +       test_cmp dir expect &&
>>         ls repo/deep >dir  &&
>>         cat >expect <<-EOF &&
>>                 a
>>                 deeper1
>>         EOF
>> +       test_cmp dir expect &&
>>         ls repo/deep/deeper1 >dir  &&
>>         cat >expect <<-EOF &&
>>                 a
>> @@ -245,4 +247,19 @@ test_expect_success 'cone mode: set with nested folders' '
>>         test_cmp repo/.git/info/sparse-checkout expect
>>  '
>>
>> +test_expect_success 'revert to old sparse-checkout on bad update' '
>> +       echo update >repo/deep/deeper2/a &&
>> +       cp repo/.git/info/sparse-checkout expect &&
>> +       test_must_fail git -C repo sparse-checkout set deep/deeper1 2>err &&
>> +       test_i18ngrep "Cannot update sparse checkout" err &&
>> +       test_cmp repo/.git/info/sparse-checkout expect &&
>> +       ls repo/deep >dir &&
>> +       cat >expect <<-EOF &&
>> +               a
>> +               deeper1
>> +               deeper2
>> +       EOF
>> +       test_cmp dir expect
>> +'
>> +
>>  test_done
>> diff --git a/unpack-trees.c b/unpack-trees.c
>> index edf0fb4673..f0fee5adf2 100644
>> --- a/unpack-trees.c
>> +++ b/unpack-trees.c
>> @@ -1508,7 +1508,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>>         memset(&pl, 0, sizeof(pl));
>>         if (!core_apply_sparse_checkout || !o->update)
>>                 o->skip_sparse_checkout = 1;
>> -       if (!o->skip_sparse_checkout) {
>> +       if (!o->skip_sparse_checkout && !o->pl) {
>>                 char *sparse = git_pathdup("info/sparse-checkout");
>>                 pl.use_cone_patterns = core_sparse_checkout_cone;
>>                 if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
>> @@ -1681,7 +1681,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>>
>>  done:
>>         trace_performance_leave("unpack_trees");
>> -       clear_pattern_list(&pl);
>> +       if (!o->keep_pattern_list)
>> +               clear_pattern_list(&pl);
>>         return ret;
>>
>>  return_failed:
>> diff --git a/unpack-trees.h b/unpack-trees.h
>> index f2eee0c7c5..ca94a421a5 100644
>> --- a/unpack-trees.h
>> +++ b/unpack-trees.h
>> @@ -59,7 +59,8 @@ struct unpack_trees_options {
>>                      quiet,
>>                      exiting_early,
>>                      show_all_errors,
>> -                    dry_run;
>> +                    dry_run,
>> +                    keep_pattern_list;
>>         const char *prefix;
>>         int cache_bottom;
>>         struct dir_struct *dir;
>> --
> 
> The rest looks reasonable.
> 


^ permalink raw reply	[flat|nested] 196+ messages in thread

* Re: [PATCH v3 16/17] sparse-checkout: write using lockfile
  2019-10-12 22:59       ` Elijah Newren
@ 2019-10-14 20:41         ` Derrick Stolee
  0 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee @ 2019-10-14 20:41 UTC (permalink / raw)
  To: Elijah Newren, Derrick Stolee via GitGitGadget
  Cc: Git Mailing List, Junio C Hamano, Derrick Stolee, Kevin Willford

On 10/12/2019 6:59 PM, Elijah Newren wrote:
> On Mon, Oct 7, 2019 at 1:08 PM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> If two 'git sparse-checkout set' subcommands are launched at the
>> same time, the behavior can be unexpected as they compete to write
>> the sparse-checkout file and update the working directory.
>>
>> Take a lockfile around the writes to the sparse-checkout file. In
>> addition, acquire this lock around the working directory update
>> to avoid two commands updating the working directory in different
>> ways.
> 
> Wow, there's something I never would have thought to check.  Did you
> have folks run into this, or is this just some defensive programming?
> Either way, I'm impressed.

This is defensive programming thanks to Kevin Willford's careful
review [1].

-Stolee

[1] https://github.com/microsoft/git/pull/204#discussion_r330252848


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 00/17] New sparse-checkout builtin and "cone" mode
  2019-10-07 20:08   ` [PATCH v3 00/17] " Derrick Stolee via GitGitGadget
                       ` (17 preceding siblings ...)
  2019-10-12 23:22     ` [PATCH v3 00/17] New sparse-checkout builtin and "cone" mode Elijah Newren
@ 2019-10-15 13:55     ` " Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 01/17] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
                         ` (19 more replies)
  18 siblings, 20 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano

V4 UPDATE: Rebased on latest master to include ew/hashmap and
ds/include-exclude in the base.

This series makes the sparse-checkout feature more user-friendly. While
there, I also present a way to use a limited set of patterns to gain a
significant performance boost in very large repositories.

Sparse-checkout is only documented as a subsection of the read-tree docs
[1], which makes the feature hard to discover. Users have trouble navigating
the feature, especially at clone time [2], and have even resorted to
creating their own helper tools [3].

This series attempts to solve these problems using a new builtin. Here is a
sample workflow to give a feeling for how it can work:

In an existing repo:

$ git sparse-checkout init
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout set "/*" "!/*/" /myFolder/
$ ls
myFile1.txt myFile2.txt myFolder
$ ls myFolder
a.c a.h
$ git sparse-checkout disable
$ ls
hiddenFolder myFile1.txt myFile2.txt myFolder

At clone time:

$ git clone --sparse origin repo
$ cd repo
$ ls
myFile1.txt myFile2.txt
$ git sparse-checkout set "/*" "!/*/" /myFolder/
$ ls
myFile1.txt myFile2.txt myFolder

Here are some more specific details:

 * git sparse-checkout init enables core.sparseCheckout and populates the
   sparse-checkout file with patterns that match only the files at root.
   
   
 * git clone learns the --sparse argument to run git sparse-checkout init 
   before the first checkout.
   
   
 * git sparse-checkout set reads patterns from the arguments, or with
   --stdin reads patterns from stdin one per line, then writes them to the
   sparse-checkout file and refreshes the working directory.
   
   
 * git sparse-checkout disable removes the patterns from the sparse-checkout
   file, disables core.sparseCheckout, and refills the working directory.
   
   
 * git sparse-checkout list lists the contents of the sparse-checkout file.
   
   

The documentation for the sparse-checkout feature can now live primarily
with the git-sparse-checkout documentation.

Cone Mode
=========

What really got me interested in this area is a performance problem. If we
have N patterns in the sparse-checkout file and M entries in the index, then
we can perform up to O(N * M) pattern checks in clear_ce_flags(). This
quadratic growth is not sustainable in a repo with 1,000+ patterns and
1,000,000+ index entries.

To solve this problem, I propose a new, more restrictive mode to
sparse-checkout: "cone mode". In this mode, all patterns are based on prefix
matches at a directory level. This can then use hashsets for fast
performance -- O(M) instead of O(N*M). My hashset implementation is based on
the virtual filesystem hook in the VFS for Git custom code [4].

In cone mode, a user specifies a list of folders which the user wants every
file inside. In addition, the cone adds all blobs that are siblings of the
folders in the directory path to that folder. This makes the directories
look "hydrated" as a user drills down to those recursively-closed folders.
These directories are called "parent" folders, as a file matches them only
if the file's immediate parent is that directory.

When building a prototype of this feature, I used a separate file to contain
the list of recursively-closed folders and built the hashsets dynamically
based on that file. In this implementation, I tried to maximize the amount
of backwards-compatibility by storing all data in the sparse-checkout file
using patterns recognized by earlier Git versions.

For example, if we add A/B/C as a recursive folder, then we add the
following patterns to the sparse-checkout file:

/*
!/*/
/A/
!/A/*/
/A/B/
!/A/B/*/
/A/B/C/

The alternating positive/negative patterns say "include everything in this
folder, but exclude everything another level deeper". The final pattern has
no matching negation, so is a recursively closed pattern.

Note that I have some basic warnings to try and check that the
sparse-checkout file doesn't match what would be written by a cone-mode add.
In such a case, Git writes a warning to stderr and continues with the old
pattern matching algorithm. These checks are currently very barebones, and
would need to be updated with more robust checks for things like regex
characters in the middle of the pattern. As review moves forward (and if we
don't change the data storage) then we could spend more time on this.

Thanks, -Stolee

Updates in v2, relative to the RFC:

 * Instead of an 'add' subcommand, use a 'set' subcommand. We can consider
   adding 'add' and/or 'remove' subcommands later.
   
   
 * 'set' reads from the arguments by default. '--stdin' option is available.
   
   
 * A new performance-oriented commit is added at the end.
   
   
 * Patterns no longer end with a trailing asterisk except for the first "/*"
   pattern.
   
   
 * References to a "bug" (that was really a strange GVFS interaction in
   microsoft/git) around deleting outside the cone are removed.
   
   

Updates in v3:

 * The bad interaction with "cone mode" and .gitignore files is fixed. A
   test is added in the last patch.
   
   
 * Several patches are added that make the feature more robust. One
   sanitizes user input, a few add progress indicators, and a few more
   prevent users from getting into bad states due to working directory
   changes or concurrent processes.
   
   
 * Updated several docs and commit messages according to feedback. Thanks,
   Elijah!
   
   

Updates in V4:

 * Updated hashmap API usage to respond to ew/hashmap
   
   
 * Responded to detailed review by Elijah. Thanks!
   
   
 * Marked the feature as experimental in git-sparse-checkout.txt the same
   way that git-switch.txt does.
   
   

Things to leave for future patches:

 1. Integrate in 'git worktree add' to copy the sparse-checkout file to a
    worktree-specific file.
    
    
 2. More robustness around detecting non-cone patterns with wildcards in the
    middle of the line.
    
    
 3. 'git clone --sparse-cone' to clone into "cone mode" sparse-checkouts
    (i.e. set 'core.sparseCheckoutCone=true'). This may not be
    super-valuable, as it only starts changing behavior when someone calls
    'git sparse-checkout set', but may be interesting.
    
    

[1] https://git-scm.com/docs/git-read-tree#_sparse_checkout
Sparse-checkout documentation in git-read-tree.

[2] https://stackoverflow.com/a/4909267/127088
Is it possible to do a sparse checkout without checking out the whole
repository first?

[3] http://www.marcoyuen.com/articles/2016/06/07/git-sparse.html
A blog post of a user's extra "git-sparse" helper.

[4] 
https://github.com/git/git/compare/fc5fd706ff733392053e6180086a4d7f96acc2af...01204f24c5349aa2fb0c474546d768946d315dab
The virtual filesystem hook in microsoft/git.

Derrick Stolee (16):
  sparse-checkout: create builtin with 'list' subcommand
  sparse-checkout: create 'init' subcommand
  clone: add --sparse mode
  sparse-checkout: 'set' subcommand
  sparse-checkout: add '--stdin' option to set subcommand
  sparse-checkout: create 'disable' subcommand
  sparse-checkout: add 'cone' mode
  sparse-checkout: use hashmaps for cone patterns
  sparse-checkout: init and set in cone mode
  unpack-trees: hash less in cone mode
  unpack-trees: add progress to clear_ce_flags()
  read-tree: show progress by default
  sparse-checkout: sanitize for nested folders
  sparse-checkout: update working directory in-process
  sparse-checkout: write using lockfile
  sparse-checkout: cone mode should not interact with .gitignore

Jeff Hostetler (1):
  trace2: add region in clear_ce_flags

 .gitignore                            |   1 +
 Documentation/config/core.txt         |  10 +-
 Documentation/git-clone.txt           |   8 +-
 Documentation/git-read-tree.txt       |   2 +-
 Documentation/git-sparse-checkout.txt | 150 ++++++++
 Makefile                              |   1 +
 builtin.h                             |   1 +
 builtin/clone.c                       |  27 ++
 builtin/read-tree.c                   |   3 +-
 builtin/sparse-checkout.c             | 482 ++++++++++++++++++++++++++
 cache.h                               |   6 +-
 config.c                              |   5 +
 dir.c                                 | 207 ++++++++++-
 dir.h                                 |  36 ++
 environment.c                         |   1 +
 git.c                                 |   1 +
 t/t1091-sparse-checkout-builtin.sh    | 279 +++++++++++++++
 unpack-trees.c                        | 110 ++++--
 unpack-trees.h                        |   3 +-
 19 files changed, 1285 insertions(+), 48 deletions(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh


base-commit: 108b97dc372828f0e72e56bbb40cae8e1e83ece6
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-316%2Fderrickstolee%2Fsparse-checkout%2Fupstream-v4
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-316/derrickstolee/sparse-checkout/upstream-v4
Pull-Request: https://github.com/gitgitgadget/git/pull/316

Range-diff vs v3:

  1:  30a0db68cd !  1:  63626e1097 sparse-checkout: create builtin with 'list' subcommand
     @@ -75,6 +75,8 @@
      +Initialize and modify the sparse-checkout configuration, which reduces
      +the checkout to a set of directories given by a list of prefixes.
      +
     ++THIS COMMAND IS EXPERIMENTAL. THE BEHAVIOR MAY CHANGE.
     ++
      +
      +COMMANDS
      +--------
     @@ -96,7 +98,7 @@
      +The `$GIT_DIR/info/sparse-checkout` file is used to define the
      +skip-worktree reference bitmap. When Git updates the working
      +directory, it updates the skip-worktree bits in the index based
     -+ont this file. The files matching the patterns in the file will
     ++on this file. The files matching the patterns in the file will
      +appear in the working directory, and the rest will not.
      +
      +## FULL PATTERN SET
  2:  08bb6fb7f3 !  2:  65d26de1c2 sparse-checkout: create 'init' subcommand
     @@ -32,10 +32,9 @@
      +	by Git. Add patterns to the sparse-checkout file to
      +	repopulate the working directory.
      ++
     -+The init subcommand also enables the 'extensions.worktreeConfig' setting
     -+and sets the `core.sparseCheckout` setting in the worktree-specific config
     -+file. This prevents the sparse-checkout feature from interfering with other
     -+worktrees.
     ++To avoid interfering with other worktrees, it first enables the
     ++`extensions.worktreeConfig` setting and makes sure to set the
     ++`core.sparseCheckout` setting in the worktree-specific config file.
       
       SPARSE CHECKOUT
       ----------------
     @@ -72,8 +71,8 @@
      +}
      +
      +enum sparse_checkout_mode {
     -+	MODE_NONE = 0,
     -+	MODE_FULL = 1,
     ++	MODE_NO_PATTERNS = 0,
     ++	MODE_ALL_PATTERNS = 1,
      +};
      +
      +static int sc_set_config(enum sparse_checkout_mode mode)
     @@ -107,7 +106,7 @@
      +	FILE *fp;
      +	int res;
      +
     -+	if (sc_set_config(MODE_FULL))
     ++	if (sc_set_config(MODE_ALL_PATTERNS))
      +		return 1;
      +
      +	memset(&pl, 0, sizeof(pl));
  3:  c8587a1fb0 !  3:  e59ed7128f clone: add --sparse mode
     @@ -22,8 +22,9 @@
      
          During the 'git sparse-checkout init' call, we must first look
          to see if HEAD is valid, since 'git clone' does not have a valid
     -    HEAD. The first checkout will create the HEAD ref and update the
     -    working directory correctly.
     +    HEAD at the point where it initializes the sparse-checkout. The
     +    following checkout within the clone command will create the HEAD
     +    ref and update the working directory correctly.
      
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
     @@ -121,7 +122,7 @@
       	int res;
      +	struct object_id oid;
       
     - 	if (sc_set_config(MODE_FULL))
     + 	if (sc_set_config(MODE_ALL_PATTERNS))
       		return 1;
      @@
       	fprintf(fp, "/*\n!/*/\n");
  4:  6ce1d60b38 !  4:  502b4b08f0 sparse-checkout: 'set' subcommand
     @@ -17,8 +17,8 @@
       --- a/Documentation/git-sparse-checkout.txt
       +++ b/Documentation/git-sparse-checkout.txt
      @@
     - file. This prevents the sparse-checkout feature from interfering with other
     - worktrees.
     + `extensions.worktreeConfig` setting and makes sure to set the
     + `core.sparseCheckout` setting in the worktree-specific config file.
       
      +'set'::
      +	Write a set of patterns to the sparse-checkout file, as given as
  5:  0b1ed06bc8 =  5:  2852cf8e11 sparse-checkout: add '--stdin' option to set subcommand
  6:  22b9bd21f4 !  6:  55f95f290e sparse-checkout: create 'disable' subcommand
     @@ -24,7 +24,7 @@
       ----------------
       
      @@
     - ont this file. The files matching the patterns in the file will
     + on this file. The files matching the patterns in the file will
       appear in the working directory, and the rest will not.
       
      +To enable the sparse-checkout feature, run `git sparse-checkout init` to
     @@ -82,7 +82,7 @@
      +	char *sparse_filename;
      +	FILE *fp;
      +
     -+	if (sc_set_config(MODE_FULL))
     ++	if (sc_set_config(MODE_ALL_PATTERNS))
      +		die(_("failed to change config"));
      +
      +	sparse_filename = get_sparse_checkout_filename();
     @@ -96,7 +96,7 @@
      +	unlink(sparse_filename);
      +	free(sparse_filename);
      +
     -+	return sc_set_config(MODE_NONE);
     ++	return sc_set_config(MODE_NO_PATTERNS);
      +}
      +
       int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
  7:  2c53ea13d0 =  7:  3d8f2f2007 trace2: add region in clear_ce_flags
  8:  a66ec1affc =  8:  03dc0ed716 sparse-checkout: add 'cone' mode
  9:  431933bec6 !  9:  28606a152c sparse-checkout: use hashmaps for cone patterns
     @@ -33,6 +33,8 @@
          While this example is contrived, it demonstrates how these
          patterns can slow the sparse-checkout feature.
      
     +    Helped-by: Eric Wong <e@80x24.org>
     +    Helped-by: Johannes Schindelin <Johannes.Schindelin@gmx.de>
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
       diff --git a/dir.c b/dir.c
     @@ -43,10 +45,14 @@
       }
       
      +static int pl_hashmap_cmp(const void *unused_cmp_data,
     -+			  const void *a, const void *b, const void *key)
     ++			  const struct hashmap_entry *a,
     ++			  const struct hashmap_entry *b,
     ++			  const void *key)
      +{
     -+	const struct pattern_entry *ee1 = (const struct pattern_entry *)a;
     -+	const struct pattern_entry *ee2 = (const struct pattern_entry *)b;
     ++	const struct pattern_entry *ee1 =
     ++			container_of(a, struct pattern_entry, ent);
     ++	const struct pattern_entry *ee2 =
     ++			container_of(b, struct pattern_entry, ent);
      +
      +	size_t min_len = ee1->patternlen <= ee2->patternlen
      +			 ? ee1->patternlen
     @@ -91,10 +97,11 @@
      +		translated = xmalloc(sizeof(struct pattern_entry));
      +		translated->pattern = truncated;
      +		translated->patternlen = given->patternlen - 2;
     -+		hashmap_entry_init(translated,
     ++		hashmap_entry_init(&translated->ent,
      +				   memhash(translated->pattern, translated->patternlen));
      +
     -+		if (!hashmap_get(&pl->recursive_hashmap, translated, NULL)) {
     ++		if (!hashmap_get_entry(&pl->recursive_hashmap,
     ++				       translated, ent, NULL)) {
      +			/* We did not see the "parent" included */
      +			warning(_("unrecognized negative pattern: '%s'"),
      +				given->pattern);
     @@ -103,8 +110,8 @@
      +			goto clear_hashmaps;
      +		}
      +
     -+		hashmap_add(&pl->parent_hashmap, translated);
     -+		hashmap_remove(&pl->recursive_hashmap, translated, &data);
     ++		hashmap_add(&pl->parent_hashmap, &translated->ent);
     ++		hashmap_remove(&pl->recursive_hashmap, &translated->ent, &data);
      +		free(data);
      +		return;
      +	}
     @@ -119,16 +126,16 @@
      +
      +	translated->pattern = xstrdup(given->pattern);
      +	translated->patternlen = given->patternlen;
     -+	hashmap_entry_init(translated,
     ++	hashmap_entry_init(&translated->ent,
      +			   memhash(translated->pattern, translated->patternlen));
      +
     -+	hashmap_add(&pl->recursive_hashmap, translated);
     ++	hashmap_add(&pl->recursive_hashmap, &translated->ent);
      +
     -+	if (hashmap_get(&pl->parent_hashmap, translated, NULL)) {
     ++	if (hashmap_get_entry(&pl->parent_hashmap, translated, ent, NULL)) {
      +		/* we already included this at the parent level */
      +		warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
      +			given->pattern);
     -+		hashmap_remove(&pl->parent_hashmap, translated, &data);
     ++		hashmap_remove(&pl->parent_hashmap, &translated->ent, &data);
      +		free(data);
      +		free(translated);
      +	}
     @@ -137,8 +144,8 @@
      +
      +clear_hashmaps:
      +	warning(_("disabling cone pattern matching"));
     -+	hashmap_free(&pl->parent_hashmap, 1);
     -+	hashmap_free(&pl->recursive_hashmap, 1);
     ++	hashmap_free_entries(&pl->parent_hashmap, struct pattern_entry, ent);
     ++	hashmap_free_entries(&pl->recursive_hashmap, struct pattern_entry, ent);
      +	pl->use_cone_patterns = 0;
      +}
      +
     @@ -150,8 +157,8 @@
      +	/* Check straight mapping */
      +	p.pattern = pattern->buf;
      +	p.patternlen = pattern->len;
     -+	hashmap_entry_init(&p, memhash(p.pattern, p.patternlen));
     -+	return !!hashmap_get(map, &p, NULL);
     ++	hashmap_entry_init(&p.ent, memhash(p.pattern, p.patternlen));
     ++	return !!hashmap_get_entry(map, &p, ent, NULL);
      +}
      +
      +int hashmap_contains_parent(struct hashmap *map,
 10:  69bd707e96 ! 10:  2ea3d8819b sparse-checkout: init and set in cone mode
     @@ -18,6 +18,8 @@
          we want to avoid the warning that the patterns do not match
          the cone-mode patterns.
      
     +    Helped-by: Eric Wong <e@80x24.org>
     +    Helped-by: Johannes Schindelin <Johannes.Schindelin@gmx.de>
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
       diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
     @@ -33,9 +35,9 @@
       	N_("git sparse-checkout [init|list|set|disable] <options>"),
      @@
       enum sparse_checkout_mode {
     - 	MODE_NONE = 0,
     - 	MODE_FULL = 1,
     -+	MODE_CONE = 2,
     + 	MODE_NO_PATTERNS = 0,
     + 	MODE_ALL_PATTERNS = 1,
     ++	MODE_CONE_PATTERNS = 2,
       };
       
       static int sc_set_config(enum sparse_checkout_mode mode)
     @@ -52,7 +54,7 @@
      +	argv_array_pushl(&cone_argv, "config", "--worktree",
      +			 "core.sparseCheckoutCone", NULL);
      +
     -+	if (mode == MODE_CONE)
     ++	if (mode == MODE_CONE_PATTERNS)
      +		argv_array_push(&cone_argv, "true");
      +	else
      +		argv_array_push(&cone_argv, "false");
     @@ -83,7 +85,7 @@
       	struct object_id oid;
      +	int mode;
       
     --	if (sc_set_config(MODE_FULL))
     +-	if (sc_set_config(MODE_ALL_PATTERNS))
      +	static struct option builtin_sparse_checkout_init_options[] = {
      +		OPT_BOOL(0, "cone", &init_opts.cone_mode,
      +			 N_("initialize the sparse-checkout in cone mode")),
     @@ -94,7 +96,7 @@
      +			     builtin_sparse_checkout_init_options,
      +			     builtin_sparse_checkout_init_usage, 0);
      +
     -+	mode = init_opts.cone_mode ? MODE_CONE : MODE_FULL;
     ++	mode = init_opts.cone_mode ? MODE_CONE_PATTERNS : MODE_ALL_PATTERNS;
      +
      +	if (sc_set_config(mode))
       		return 1;
     @@ -106,12 +108,12 @@
       
      +static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
      +{
     -+	struct pattern_entry *e = xmalloc(sizeof(struct pattern_entry));
     ++	struct pattern_entry *e = xmalloc(sizeof(*e));
      +	e->patternlen = path->len;
      +	e->pattern = strbuf_detach(path, NULL);
     -+	hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
     ++	hashmap_entry_init(&e->ent, memhash(e->pattern, e->patternlen));
      +
     -+	hashmap_add(&pl->recursive_hashmap, e);
     ++	hashmap_add(&pl->recursive_hashmap, &e->ent);
      +
      +	while (e->patternlen) {
      +		char *slash = strrchr(e->pattern, '/');
     @@ -125,23 +127,22 @@
      +		e = xmalloc(sizeof(struct pattern_entry));
      +		e->patternlen = newlen;
      +		e->pattern = xstrndup(oldpattern, newlen);
     -+		hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
     ++		hashmap_entry_init(&e->ent, memhash(e->pattern, e->patternlen));
      +
     -+		if (!hashmap_get(&pl->parent_hashmap, e, NULL))
     -+			hashmap_add(&pl->parent_hashmap, e);
     ++		if (!hashmap_get_entry(&pl->parent_hashmap, e, ent, NULL))
     ++			hashmap_add(&pl->parent_hashmap, &e->ent);
      +	}
      +}
      +
      +static void write_cone_to_file(FILE *fp, struct pattern_list *pl)
      +{
      +	int i;
     -+	struct pattern_entry *entry;
     ++	struct pattern_entry *pe;
      +	struct hashmap_iter iter;
      +	struct string_list sl = STRING_LIST_INIT_DUP;
      +
     -+	hashmap_iter_init(&pl->parent_hashmap, &iter);
     -+	while ((entry = hashmap_iter_next(&iter)))
     -+		string_list_insert(&sl, entry->pattern);
     ++	hashmap_for_each_entry(&pl->parent_hashmap, &iter, pe, ent)
     ++		string_list_insert(&sl, pe->pattern);
      +
      +	string_list_sort(&sl);
      +	string_list_remove_duplicates(&sl, 0);
     @@ -157,9 +158,8 @@
      +
      +	string_list_clear(&sl, 0);
      +
     -+	hashmap_iter_init(&pl->recursive_hashmap, &iter);
     -+	while ((entry = hashmap_iter_next(&iter)))
     -+		string_list_insert(&sl, entry->pattern);
     ++	hashmap_for_each_entry(&pl->recursive_hashmap, &iter, pe, ent)
     ++		string_list_insert(&sl, pe->pattern);
      +
      +	string_list_sort(&sl);
      +	string_list_remove_duplicates(&sl, 0);
     @@ -260,12 +260,16 @@
       }
       
      -static int pl_hashmap_cmp(const void *unused_cmp_data,
     --			  const void *a, const void *b, const void *key)
     +-			  const struct hashmap_entry *a,
     +-			  const struct hashmap_entry *b,
     +-			  const void *key)
      +int pl_hashmap_cmp(const void *unused_cmp_data,
     -+		   const void *a, const void *b, const void *key)
     ++		   const struct hashmap_entry *a,
     ++		   const struct hashmap_entry *b,
     ++		   const void *key)
       {
     - 	const struct pattern_entry *ee1 = (const struct pattern_entry *)a;
     - 	const struct pattern_entry *ee2 = (const struct pattern_entry *)b;
     + 	const struct pattern_entry *ee1 =
     + 			container_of(a, struct pattern_entry, ent);
      
       diff --git a/dir.h b/dir.h
       --- a/dir.h
     @@ -275,7 +279,9 @@
       		const char *name, int *dtype);
       
      +int pl_hashmap_cmp(const void *unused_cmp_data,
     -+		   const void *a, const void *b, const void *key);
     ++		   const struct hashmap_entry *a,
     ++		   const struct hashmap_entry *b,
     ++		   const void *key);
       int hashmap_contains_parent(struct hashmap *map,
       			    const char *path,
       			    struct strbuf *buffer);
     @@ -301,11 +307,13 @@
      +		a
      +		deep
      +	EOF
     ++	test_cmp expect dir &&
      +	ls repo/deep >dir  &&
      +	cat >expect <<-EOF &&
      +		a
      +		deeper1
      +	EOF
     ++	test_cmp expect dir &&
      +	ls repo/deep/deeper1 >dir  &&
      +	cat >expect <<-EOF &&
      +		a
 11:  e06349fcec = 11:  fefd1e1744 unpack-trees: hash less in cone mode
 12:  3ef32084f5 = 12:  034fbd31bd unpack-trees: add progress to clear_ce_flags()
 13:  3a677f32b6 = 13:  27aa9d22f0 read-tree: show progress by default
 14:  56444a5498 ! 14:  44c3078029 sparse-checkout: sanitize for nested folders
     @@ -22,43 +22,44 @@
          hashmap_contains_parent() method. It takes a strbuf buffer to
          avoid reallocating a buffer when calling in a tight loop.
      
     +    Helped-by: Eric Wong <e@80x24.org>
     +    Helped-by: Johannes Schindelin <Johannes.Schindelin@gmx.de>
          Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
      
       diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
       --- a/builtin/sparse-checkout.c
       +++ b/builtin/sparse-checkout.c
      @@
     - 	struct pattern_entry *entry;
     + 	struct pattern_entry *pe;
       	struct hashmap_iter iter;
       	struct string_list sl = STRING_LIST_INIT_DUP;
      +	struct strbuf parent_pattern = STRBUF_INIT;
       
     - 	hashmap_iter_init(&pl->parent_hashmap, &iter);
     --	while ((entry = hashmap_iter_next(&iter)))
     --		string_list_insert(&sl, entry->pattern);
     -+	while ((entry = hashmap_iter_next(&iter))) {
     -+		if (hashmap_get(&pl->recursive_hashmap, entry, NULL))
     +-	hashmap_for_each_entry(&pl->parent_hashmap, &iter, pe, ent)
     +-		string_list_insert(&sl, pe->pattern);
     ++	hashmap_for_each_entry(&pl->parent_hashmap, &iter, pe, ent) {
     ++		if (hashmap_get_entry(&pl->recursive_hashmap, pe, ent, NULL))
      +			continue;
      +
      +		if (!hashmap_contains_parent(&pl->recursive_hashmap,
     -+					     entry->pattern,
     ++					     pe->pattern,
      +					     &parent_pattern))
     -+			string_list_insert(&sl, entry->pattern);
     ++			string_list_insert(&sl, pe->pattern);
      +	}
       
       	string_list_sort(&sl);
       	string_list_remove_duplicates(&sl, 0);
      @@
     + 
       	string_list_clear(&sl, 0);
       
     - 	hashmap_iter_init(&pl->recursive_hashmap, &iter);
     --	while ((entry = hashmap_iter_next(&iter)))
     --		string_list_insert(&sl, entry->pattern);
     -+	while ((entry = hashmap_iter_next(&iter))) {
     +-	hashmap_for_each_entry(&pl->recursive_hashmap, &iter, pe, ent)
     +-		string_list_insert(&sl, pe->pattern);
     ++	hashmap_for_each_entry(&pl->recursive_hashmap, &iter, pe, ent) {
      +		if (!hashmap_contains_parent(&pl->recursive_hashmap,
     -+					     entry->pattern,
     ++					     pe->pattern,
      +					     &parent_pattern))
     -+			string_list_insert(&sl, entry->pattern);
     ++			string_list_insert(&sl, pe->pattern);
      +	}
      +
      +	strbuf_release(&parent_pattern);
 15:  a6f17e9a77 ! 15:  9ccec3ca9a sparse-checkout: update working directory in-process
     @@ -10,7 +10,7 @@
      
          Remove this extra process call by creating a direct call to
          unpack_trees() in the same way 'git read-tree -mu HEAD' does. In
     -    adition, provide an in-memory list of patterns so we can avoid
     +    addition, provide an in-memory list of patterns so we can avoid
          reading from the sparse-checkout file. This allows us to test a
          proposed change to the file before writing to it.
      
     @@ -109,12 +109,12 @@
       			     builtin_sparse_checkout_init_options,
       			     builtin_sparse_checkout_init_usage, 0);
       
     --	mode = init_opts.cone_mode ? MODE_CONE : MODE_FULL;
     +-	mode = init_opts.cone_mode ? MODE_CONE_PATTERNS : MODE_ALL_PATTERNS;
      +	if (init_opts.cone_mode) {
     -+		mode = MODE_CONE;
     ++		mode = MODE_CONE_PATTERNS;
      +		core_sparse_checkout_cone = 1;
      +	} else
     -+		mode = MODE_FULL;
     ++		mode = MODE_ALL_PATTERNS;
       
       	if (sc_set_config(mode))
       		return 1;
     @@ -128,20 +128,12 @@
       }
       
       static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
     - {
     --	struct pattern_entry *e = xmalloc(sizeof(struct pattern_entry));
     -+	struct pattern_entry *e = xmalloc(sizeof(*e));
     -+
     - 	e->patternlen = path->len;
     - 	e->pattern = strbuf_detach(path, NULL);
     - 	hashmap_entry_init(e, memhash(e->pattern, e->patternlen));
      @@
       {
       	char *sparse_filename;
       	FILE *fp;
     --
      +	int result;
     -+	
     + 
       	if (!core_apply_sparse_checkout) {
       		warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
       		warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
     @@ -193,20 +185,6 @@
       diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
       --- a/t/t1091-sparse-checkout-builtin.sh
       +++ b/t/t1091-sparse-checkout-builtin.sh
     -@@
     - 		a
     - 		deep
     - 	EOF
     -+	test_cmp dir expect &&
     - 	ls repo/deep >dir  &&
     - 	cat >expect <<-EOF &&
     - 		a
     - 		deeper1
     - 	EOF
     -+	test_cmp dir expect &&
     - 	ls repo/deep/deeper1 >dir  &&
     - 	cat >expect <<-EOF &&
     - 		a
      @@
       	test_cmp repo/.git/info/sparse-checkout expect
       '
 16:  8927494b8c ! 16:  d0421ef7b2 sparse-checkout: write using lockfile
     @@ -23,7 +23,7 @@
      +	int fd;
      +	struct lock_file lk = LOCK_INIT;
       	int result;
     - 	
     + 
       	if (!core_apply_sparse_checkout) {
      @@
       
 17:  7f377c1407 = 17:  ed1f148763 sparse-checkout: cone mode should not interact with .gitignore

-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 01/17] sparse-checkout: create builtin with 'list' subcommand
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-16 19:00         ` Elijah Newren
  2019-10-18 16:07         ` SZEDER Gábor
  2019-10-15 13:55       ` [PATCH v4 02/17] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
                         ` (18 subsequent siblings)
  19 siblings, 2 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature is mostly hidden from users, as its
only documentation is supplementary information in the docs for
'git read-tree'. In addition, users need to know how to edit the
.git/info/sparse-checkout file with the right patterns, then run
the appropriate 'git read-tree -mu HEAD' command. Keeping the
working directory in sync with the sparse-checkout file requires
care.

Begin an effort to make the sparse-checkout feature a porcelain
feature by creating a new 'git sparse-checkout' builtin. This
builtin will be the preferred mechanism for manipulating the
sparse-checkout file and syncing the working directory.

The documentation provided is adapted from the "git read-tree"
documentation with a few edits for clarity in the new context.
Extra sections are added to hint toward a future change to
a more restricted pattern set.

Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 .gitignore                            |  1 +
 Documentation/git-read-tree.txt       |  2 +-
 Documentation/git-sparse-checkout.txt | 87 +++++++++++++++++++++++++++
 Makefile                              |  1 +
 builtin.h                             |  1 +
 builtin/sparse-checkout.c             | 86 ++++++++++++++++++++++++++
 git.c                                 |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 50 +++++++++++++++
 8 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh

diff --git a/.gitignore b/.gitignore
index 89b3b79c1a..aebe7c0908 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,6 +158,7 @@
 /git-show-branch
 /git-show-index
 /git-show-ref
+/git-sparse-checkout
 /git-stage
 /git-stash
 /git-status
diff --git a/Documentation/git-read-tree.txt b/Documentation/git-read-tree.txt
index d271842608..da33f84f33 100644
--- a/Documentation/git-read-tree.txt
+++ b/Documentation/git-read-tree.txt
@@ -436,7 +436,7 @@ support.
 SEE ALSO
 --------
 linkgit:git-write-tree[1]; linkgit:git-ls-files[1];
-linkgit:gitignore[5]
+linkgit:gitignore[5]; linkgit:git-sparse-checkout[1]
 
 GIT
 ---
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
new file mode 100644
index 0000000000..46d3dc3cb1
--- /dev/null
+++ b/Documentation/git-sparse-checkout.txt
@@ -0,0 +1,87 @@
+git-sparse-checkout(1)
+=======================
+
+NAME
+----
+git-sparse-checkout - Initialize and modify the sparse-checkout
+configuration, which reduces the checkout to a set of directories
+given by a list of prefixes.
+
+
+SYNOPSIS
+--------
+[verse]
+'git sparse-checkout <subcommand> [options]'
+
+
+DESCRIPTION
+-----------
+
+Initialize and modify the sparse-checkout configuration, which reduces
+the checkout to a set of directories given by a list of prefixes.
+
+THIS COMMAND IS EXPERIMENTAL. THE BEHAVIOR MAY CHANGE.
+
+
+COMMANDS
+--------
+'list'::
+	Provide a list of the contents in the sparse-checkout file.
+
+
+SPARSE CHECKOUT
+----------------
+
+"Sparse checkout" allows populating the working directory sparsely.
+It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
+Git whether a file in the working directory is worth looking at. If
+the skip-worktree bit is set, then the file is ignored in the working
+directory. Git will not populate the contents of those files, which
+makes a sparse checkout helpful when working in a repository with many
+files, but only a few are important to the current user.
+
+The `$GIT_DIR/info/sparse-checkout` file is used to define the
+skip-worktree reference bitmap. When Git updates the working
+directory, it updates the skip-worktree bits in the index based
+on this file. The files matching the patterns in the file will
+appear in the working directory, and the rest will not.
+
+## FULL PATTERN SET
+
+By default, the sparse-checkout file uses the same syntax as `.gitignore`
+files.
+
+While `$GIT_DIR/info/sparse-checkout` is usually used to specify what
+files are included, you can also specify what files are _not_ included,
+using negative patterns. For example, to remove the file `unwanted`:
+
+----------------
+/*
+!unwanted
+----------------
+
+Another tricky thing is fully repopulating the working directory when you
+no longer want sparse checkout. You cannot just disable "sparse
+checkout" because skip-worktree bits are still in the index and your working
+directory is still sparsely populated. You should re-populate the working
+directory with the `$GIT_DIR/info/sparse-checkout` file content as
+follows:
+
+----------------
+/*
+----------------
+
+Then you can disable sparse checkout. Sparse checkout support in 'git
+checkout' and similar commands is disabled by default. You need to
+set `core.sparseCheckout` to `true` in order to have sparse checkout
+support.
+
+SEE ALSO
+--------
+
+linkgit:git-read-tree[1]
+linkgit:gitignore[5]
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index de60c8e7aa..adefc229fe 100644
--- a/Makefile
+++ b/Makefile
@@ -1125,6 +1125,7 @@ BUILTIN_OBJS += builtin/shortlog.o
 BUILTIN_OBJS += builtin/show-branch.o
 BUILTIN_OBJS += builtin/show-index.o
 BUILTIN_OBJS += builtin/show-ref.o
+BUILTIN_OBJS += builtin/sparse-checkout.o
 BUILTIN_OBJS += builtin/stash.o
 BUILTIN_OBJS += builtin/stripspace.o
 BUILTIN_OBJS += builtin/submodule--helper.o
diff --git a/builtin.h b/builtin.h
index 5cf5df69f7..2b25a80cde 100644
--- a/builtin.h
+++ b/builtin.h
@@ -225,6 +225,7 @@ int cmd_shortlog(int argc, const char **argv, const char *prefix);
 int cmd_show(int argc, const char **argv, const char *prefix);
 int cmd_show_branch(int argc, const char **argv, const char *prefix);
 int cmd_show_index(int argc, const char **argv, const char *prefix);
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix);
 int cmd_status(int argc, const char **argv, const char *prefix);
 int cmd_stash(int argc, const char **argv, const char *prefix);
 int cmd_stripspace(int argc, const char **argv, const char *prefix);
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
new file mode 100644
index 0000000000..eed9625a05
--- /dev/null
+++ b/builtin/sparse-checkout.c
@@ -0,0 +1,86 @@
+#include "builtin.h"
+#include "config.h"
+#include "dir.h"
+#include "parse-options.h"
+#include "pathspec.h"
+#include "repository.h"
+#include "run-command.h"
+#include "strbuf.h"
+
+static char const * const builtin_sparse_checkout_usage[] = {
+	N_("git sparse-checkout [list]"),
+	NULL
+};
+
+static char *get_sparse_checkout_filename(void)
+{
+	return git_pathdup("info/sparse-checkout");
+}
+
+static void write_patterns_to_file(FILE *fp, struct pattern_list *pl)
+{
+	int i;
+
+	for (i = 0; i < pl->nr; i++) {
+		struct path_pattern *p = pl->patterns[i];
+
+		if (p->flags & PATTERN_FLAG_NEGATIVE)
+			fprintf(fp, "!");
+
+		fprintf(fp, "%s", p->pattern);
+
+		if (p->flags & PATTERN_FLAG_MUSTBEDIR)
+			fprintf(fp, "/");
+
+		fprintf(fp, "\n");
+	}
+}
+
+static int sparse_checkout_list(int argc, const char **argv)
+{
+	struct pattern_list pl;
+	char *sparse_filename;
+	int res;
+
+	memset(&pl, 0, sizeof(pl));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
+	free(sparse_filename);
+
+	if (res < 0) {
+		warning(_("this worktree is not sparse (sparse-checkout file may not exist)"));
+		return 0;
+	}
+
+	write_patterns_to_file(stdout, &pl);
+	clear_pattern_list(&pl);
+
+	return 0;
+}
+
+int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
+{
+	static struct option builtin_sparse_checkout_options[] = {
+		OPT_END(),
+	};
+
+	if (argc == 2 && !strcmp(argv[1], "-h"))
+		usage_with_options(builtin_sparse_checkout_usage,
+				   builtin_sparse_checkout_options);
+
+	argc = parse_options(argc, argv, prefix,
+			     builtin_sparse_checkout_options,
+			     builtin_sparse_checkout_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+
+	git_config(git_default_config, NULL);
+
+	if (argc > 0) {
+		if (!strcmp(argv[0], "list"))
+			return sparse_checkout_list(argc, argv);
+	}
+
+	usage_with_options(builtin_sparse_checkout_usage,
+			   builtin_sparse_checkout_options);
+}
diff --git a/git.c b/git.c
index ce6ab0ece2..7be7ad34bd 100644
--- a/git.c
+++ b/git.c
@@ -572,6 +572,7 @@ static struct cmd_struct commands[] = {
 	{ "show-branch", cmd_show_branch, RUN_SETUP },
 	{ "show-index", cmd_show_index },
 	{ "show-ref", cmd_show_ref, RUN_SETUP },
+	{ "sparse-checkout", cmd_sparse_checkout, RUN_SETUP | NEED_WORK_TREE },
 	{ "stage", cmd_add, RUN_SETUP | NEED_WORK_TREE },
 	/*
 	 * NEEDSWORK: Until the builtin stash is thoroughly robust and no
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
new file mode 100755
index 0000000000..a9b04b1a88
--- /dev/null
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+test_description='sparse checkout builtin tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	git init repo &&
+	(
+		cd repo &&
+		echo "initial" >a &&
+		mkdir folder1 folder2 deep &&
+		mkdir deep/deeper1 deep/deeper2 &&
+		mkdir deep/deeper1/deepest &&
+		cp a folder1 &&
+		cp a folder2 &&
+		cp a deep &&
+		cp a deep/deeper1 &&
+		cp a deep/deeper2 &&
+		cp a deep/deeper1/deepest &&
+		git add . &&
+		git commit -m "initial commit"
+	)
+'
+
+test_expect_success 'git sparse-checkout list (empty)' '
+	git -C repo sparse-checkout list >list 2>err &&
+	test_line_count = 0 list &&
+	test_i18ngrep "this worktree is not sparse (sparse-checkout file may not exist)" err
+'
+
+test_expect_success 'git sparse-checkout list (populated)' '
+	test_when_finished rm -f repo/.git/info/sparse-checkout &&
+	cat >repo/.git/info/sparse-checkout <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	git -C repo sparse-checkout list >list &&
+	cat >expect <<-EOF &&
+		/folder1/*
+		/deep/
+		**/a
+		!*bin*
+	EOF
+	test_cmp expect list
+'
+
+test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 02/17] sparse-checkout: create 'init' subcommand
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 01/17] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 03/17] clone: add --sparse mode Derrick Stolee via GitGitGadget
                         ` (17 subsequent siblings)
  19 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

Getting started with a sparse-checkout file can be daunting. Help
users start their sparse enlistment using 'git sparse-checkout init'.
This will set 'core.sparseCheckout=true' in their config, write
an initial set of patterns to the sparse-checkout file, and update
their working directory.

Make sure to use the `extensions.worktreeConfig` setting and write
the sparse checkout config to the worktree-specific config file.
This avoids confusing interactions with other worktrees.

Running another process for 'git read-tree' is sub-optimal. The
extra process will be removed in a later change.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt | 11 ++++
 builtin/sparse-checkout.c             | 79 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 41 ++++++++++++++
 3 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 46d3dc3cb1..d5fbbf17a0 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -28,6 +28,17 @@ COMMANDS
 'list'::
 	Provide a list of the contents in the sparse-checkout file.
 
+'init'::
+	Enable the `core.sparseCheckout` setting. If the
+	sparse-checkout file does not exist, then populate it with
+	patterns that match every file in the root directory and
+	no other directories, then remove all directories tracked
+	by Git. Add patterns to the sparse-checkout file to
+	repopulate the working directory.
++
+To avoid interfering with other worktrees, it first enables the
+`extensions.worktreeConfig` setting and makes sure to set the
+`core.sparseCheckout` setting in the worktree-specific config file.
 
 SPARSE CHECKOUT
 ----------------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index eed9625a05..1d2327111a 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [list]"),
+	N_("git sparse-checkout [init|list]"),
 	NULL
 };
 
@@ -59,6 +59,81 @@ static int sparse_checkout_list(int argc, const char **argv)
 	return 0;
 }
 
+static int update_working_directory(void)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to update index with new sparse-checkout paths"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
+enum sparse_checkout_mode {
+	MODE_NO_PATTERNS = 0,
+	MODE_ALL_PATTERNS = 1,
+};
+
+static int sc_set_config(enum sparse_checkout_mode mode)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+
+	if (git_config_set_gently("extensions.worktreeConfig", "true")) {
+		error(_("failed to set extensions.worktreeConfig setting"));
+		return 1;
+	}
+
+	argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL);
+
+	if (mode)
+		argv_array_pushl(&argv, "true", NULL);
+	else
+		argv_array_pushl(&argv, "false", NULL);
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to enable core.sparseCheckout"));
+		return 1;
+	}
+
+	return 0;
+}
+
+static int sparse_checkout_init(int argc, const char **argv)
+{
+	struct pattern_list pl;
+	char *sparse_filename;
+	FILE *fp;
+	int res;
+
+	if (sc_set_config(MODE_ALL_PATTERNS))
+		return 1;
+
+	memset(&pl, 0, sizeof(pl));
+
+	sparse_filename = get_sparse_checkout_filename();
+	res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL);
+
+	/* If we already have a sparse-checkout file, use it. */
+	if (res >= 0) {
+		free(sparse_filename);
+		goto reset_dir;
+	}
+
+	/* initial mode: all blobs at root */
+	fp = fopen(sparse_filename, "w");
+	free(sparse_filename);
+	fprintf(fp, "/*\n!/*/\n");
+	fclose(fp);
+
+reset_dir:
+	return update_working_directory();
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -79,6 +154,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 	if (argc > 0) {
 		if (!strcmp(argv[0], "list"))
 			return sparse_checkout_list(argc, argv);
+		if (!strcmp(argv[0], "init"))
+			return sparse_checkout_init(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index a9b04b1a88..c70085a759 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -47,4 +47,45 @@ test_expect_success 'git sparse-checkout list (populated)' '
 	test_cmp expect list
 '
 
+test_expect_success 'git sparse-checkout init' '
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=true" config &&
+	ls repo >dir  &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
+test_expect_success 'git sparse-checkout list after init' '
+	git -C repo sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect actual
+'
+
+test_expect_success 'init with existing sparse-checkout' '
+	echo "*folder*" >> repo/.git/info/sparse-checkout &&
+	git -C repo sparse-checkout init &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		*folder*
+	EOF
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 03/17] clone: add --sparse mode
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 01/17] sparse-checkout: create builtin with 'list' subcommand Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 02/17] sparse-checkout: create 'init' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 04/17] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
                         ` (16 subsequent siblings)
  19 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

When someone wants to clone a large repository, but plans to work
using a sparse-checkout file, they either need to do a full
checkout first and then reduce the patterns they included, or
clone with --no-checkout, set up their patterns, and then run
a checkout manually. This requires knowing a lot about the repo
shape and how sparse-checkout works.

Add a new '--sparse' option to 'git clone' that initializes the
sparse-checkout file to include the following patterns:

	/*
	!/*/

These patterns include every file in the root directory, but
no directories. This allows a repo to include files like a
README or a bootstrapping script to grow enlistments from that
point.

During the 'git sparse-checkout init' call, we must first look
to see if HEAD is valid, since 'git clone' does not have a valid
HEAD at the point where it initializes the sparse-checkout. The
following checkout within the clone command will create the HEAD
ref and update the working directory correctly.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-clone.txt        |  8 +++++++-
 builtin/clone.c                    | 27 +++++++++++++++++++++++++++
 builtin/sparse-checkout.c          |  6 ++++++
 t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
index 34011c2940..0fe91d2f04 100644
--- a/Documentation/git-clone.txt
+++ b/Documentation/git-clone.txt
@@ -15,7 +15,7 @@ SYNOPSIS
 	  [--dissociate] [--separate-git-dir <git dir>]
 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
-	  [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
+	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
 	  [<directory>]
 
 DESCRIPTION
@@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
 	used, neither remote-tracking branches nor the related
 	configuration variables are created.
 
+--sparse::
+	Initialize the sparse-checkout file so the working
+	directory starts with only the files in the root
+	of the repository. The sparse-checkout file can be
+	modified to grow the working directory as needed.
+
 --mirror::
 	Set up a mirror of the source repository.  This implies `--bare`.
 	Compared to `--bare`, `--mirror` not only maps local branches of the
diff --git a/builtin/clone.c b/builtin/clone.c
index c46ee29f0a..4348d962c9 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -59,6 +59,7 @@ static const char *real_git_dir;
 static char *option_upload_pack = "git-upload-pack";
 static int option_verbosity;
 static int option_progress = -1;
+static int option_sparse_checkout;
 static enum transport_family family;
 static struct string_list option_config = STRING_LIST_INIT_NODUP;
 static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
@@ -146,6 +147,8 @@ static struct option builtin_clone_options[] = {
 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
 		    N_("any cloned submodules will use their remote-tracking branch")),
+	OPT_BOOL(0, "sparse", &option_sparse_checkout,
+		    N_("initialize sparse-checkout file to include only files at root")),
 	OPT_END()
 };
 
@@ -733,6 +736,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
 	}
 }
 
+static int git_sparse_checkout_init(const char *repo)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
+
+	/*
+	 * We must apply the setting in the current process
+	 * for the later checkout to use the sparse-checkout file.
+	 */
+	core_apply_sparse_checkout = 1;
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to initialize sparse-checkout"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
 static int checkout(int submodule_progress)
 {
 	struct object_id oid;
@@ -1106,6 +1130,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	if (option_required_reference.nr || option_optional_reference.nr)
 		setup_reference();
 
+	if (option_sparse_checkout && git_sparse_checkout_init(repo))
+		return 1;
+
 	remote = remote_get(option_origin);
 
 	strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 1d2327111a..4198995d46 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -109,6 +109,7 @@ static int sparse_checkout_init(int argc, const char **argv)
 	char *sparse_filename;
 	FILE *fp;
 	int res;
+	struct object_id oid;
 
 	if (sc_set_config(MODE_ALL_PATTERNS))
 		return 1;
@@ -130,6 +131,11 @@ static int sparse_checkout_init(int argc, const char **argv)
 	fprintf(fp, "/*\n!/*/\n");
 	fclose(fp);
 
+	if (get_oid("HEAD", &oid)) {
+		/* assume we are in a fresh repo */
+		return 0;
+	}
+
 reset_dir:
 	return update_working_directory();
 }
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index c70085a759..d4c145a3af 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -88,4 +88,17 @@ test_expect_success 'init with existing sparse-checkout' '
 	test_cmp expect dir
 '
 
+test_expect_success 'clone --sparse' '
+	git clone --sparse repo clone &&
+	git -C clone sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+	EOF
+	test_cmp expect actual &&
+	ls clone >dir &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 04/17] sparse-checkout: 'set' subcommand
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
                         ` (2 preceding siblings ...)
  2019-10-15 13:55       ` [PATCH v4 03/17] clone: add --sparse mode Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 05/17] sparse-checkout: add '--stdin' option to set subcommand Derrick Stolee via GitGitGadget
                         ` (15 subsequent siblings)
  19 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The 'git sparse-checkout set' subcommand takes a list of patterns
as arguments and writes them to the sparse-checkout file. Then, it
updates the working directory using 'git read-tree -mu HEAD'.

The 'set' subcommand will replace the entire contents of the
sparse-checkout file. The write_patterns_and_update() method is
extracted from cmd_sparse_checkout() to make it easier to implement
'add' and/or 'remove' subcommands in the future.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt |  5 ++++
 builtin/sparse-checkout.c             | 35 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 19 +++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index d5fbbf17a0..163f6c8db0 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -40,6 +40,11 @@ To avoid interfering with other worktrees, it first enables the
 `extensions.worktreeConfig` setting and makes sure to set the
 `core.sparseCheckout` setting in the worktree-specific config file.
 
+'set'::
+	Write a set of patterns to the sparse-checkout file, as given as
+	a list of arguments following the 'set' subcommand. Update the
+	working directory to match the new patterns.
+
 SPARSE CHECKOUT
 ----------------
 
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 4198995d46..2103cbe00c 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|list]"),
+	N_("git sparse-checkout [init|list|set] <options>"),
 	NULL
 };
 
@@ -140,6 +140,37 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return update_working_directory();
 }
 
+static int write_patterns_and_update(struct pattern_list *pl)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	sparse_filename = get_sparse_checkout_filename();
+	fp = fopen(sparse_filename, "w");
+	write_patterns_to_file(fp, pl);
+	fclose(fp);
+	free(sparse_filename);
+
+	return update_working_directory();
+}
+
+static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
+{
+	static const char *empty_base = "";
+	int i;
+	struct pattern_list pl;
+	int result;
+	memset(&pl, 0, sizeof(pl));
+
+	for (i = 1; i < argc; i++)
+		add_pattern(argv[i], empty_base, 0, &pl, 0);
+
+	result = write_patterns_and_update(&pl);
+
+	clear_pattern_list(&pl);
+	return result;
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -162,6 +193,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_list(argc, argv);
 		if (!strcmp(argv[0], "init"))
 			return sparse_checkout_init(argc, argv);
+		if (!strcmp(argv[0], "set"))
+			return sparse_checkout_set(argc, argv, prefix);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index d4c145a3af..19e8673c6b 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -101,4 +101,23 @@ test_expect_success 'clone --sparse' '
 	test_cmp expect dir
 '
 
+test_expect_success 'set sparse-checkout using builtin' '
+	git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		*folder*
+	EOF
+	git -C repo sparse-checkout list >actual &&
+	test_cmp expect actual &&
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 05/17] sparse-checkout: add '--stdin' option to set subcommand
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
                         ` (3 preceding siblings ...)
  2019-10-15 13:55       ` [PATCH v4 04/17] sparse-checkout: 'set' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 06/17] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
                         ` (14 subsequent siblings)
  19 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The 'git sparse-checkout set' subcommand takes a list of patterns
and places them in the sparse-checkout file. Then, it updates the
working directory to match those patterns. For a large list of
patterns, the command-line call can get very cumbersome.

Add a '--stdin' option to instead read patterns over standard in.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 40 ++++++++++++++++++++++++++++--
 t/t1091-sparse-checkout-builtin.sh | 27 ++++++++++++++++++++
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 2103cbe00c..b747b78d34 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -145,6 +145,11 @@ static int write_patterns_and_update(struct pattern_list *pl)
 	char *sparse_filename;
 	FILE *fp;
 
+	if (!core_apply_sparse_checkout) {
+		warning(_("core.sparseCheckout is disabled, so changes to the sparse-checkout file will have no effect"));
+		warning(_("run 'git sparse-checkout init' to enable the sparse-checkout feature"));
+	}
+
 	sparse_filename = get_sparse_checkout_filename();
 	fp = fopen(sparse_filename, "w");
 	write_patterns_to_file(fp, pl);
@@ -154,16 +159,47 @@ static int write_patterns_and_update(struct pattern_list *pl)
 	return update_working_directory();
 }
 
+static char const * const builtin_sparse_checkout_set_usage[] = {
+	N_("git sparse-checkout set [--stdin|<patterns>]"),
+	NULL
+};
+
+static struct sparse_checkout_set_opts {
+	int use_stdin;
+} set_opts;
+
 static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 {
 	static const char *empty_base = "";
 	int i;
 	struct pattern_list pl;
 	int result;
+
+	static struct option builtin_sparse_checkout_set_options[] = {
+		OPT_BOOL(0, "stdin", &set_opts.use_stdin,
+			 N_("read patterns from standard in")),
+		OPT_END(),
+	};
+
 	memset(&pl, 0, sizeof(pl));
 
-	for (i = 1; i < argc; i++)
-		add_pattern(argv[i], empty_base, 0, &pl, 0);
+	argc = parse_options(argc, argv, prefix,
+			     builtin_sparse_checkout_set_options,
+			     builtin_sparse_checkout_set_usage,
+			     PARSE_OPT_KEEP_UNKNOWN);
+
+	if (set_opts.use_stdin) {
+		struct strbuf line = STRBUF_INIT;
+
+		while (!strbuf_getline(&line, stdin)) {
+			size_t len;
+			char *buf = strbuf_detach(&line, &len);
+			add_pattern(buf, empty_base, 0, &pl, 0);
+		}
+	} else {
+		for (i = 0; i < argc; i++)
+			add_pattern(argv[i], empty_base, 0, &pl, 0);
+	}
 
 	result = write_patterns_and_update(&pl);
 
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 19e8673c6b..2a0137fde3 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -101,6 +101,13 @@ test_expect_success 'clone --sparse' '
 	test_cmp expect dir
 '
 
+test_expect_success 'warn if core.sparseCheckout is disabled' '
+	test_when_finished git -C repo config --worktree core.sparseCheckout true &&
+	git -C repo config --worktree core.sparseCheckout false &&
+	git -C repo sparse-checkout set folder1 2>err &&
+	test_i18ngrep "core.sparseCheckout is disabled" err
+'
+
 test_expect_success 'set sparse-checkout using builtin' '
 	git -C repo sparse-checkout set "/*" "!/*/" "*folder*" &&
 	cat >expect <<-EOF &&
@@ -120,4 +127,24 @@ test_expect_success 'set sparse-checkout using builtin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'set sparse-checkout using --stdin' '
+	cat >expect <<-EOF &&
+		/*
+		!/*/
+		/folder1/
+		/folder2/
+	EOF
+	git -C repo sparse-checkout set --stdin <expect &&
+	git -C repo sparse-checkout list >actual &&
+	test_cmp expect actual &&
+	test_cmp expect repo/.git/info/sparse-checkout &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 06/17] sparse-checkout: create 'disable' subcommand
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
                         ` (4 preceding siblings ...)
  2019-10-15 13:55       ` [PATCH v4 05/17] sparse-checkout: add '--stdin' option to set subcommand Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-18 16:31         ` SZEDER Gábor
  2019-10-15 13:55       ` [PATCH v4 07/17] trace2: add region in clear_ce_flags Jeff Hostetler via GitGitGadget
                         ` (13 subsequent siblings)
  19 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The instructions for disabling a sparse-checkout to a full
working directory are complicated and non-intuitive. Add a
subcommand, 'git sparse-checkout disable', to perform those
steps for the user.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-sparse-checkout.txt | 27 ++++++++++++---------------
 builtin/sparse-checkout.c             | 26 +++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 15 +++++++++++++++
 3 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 163f6c8db0..b0c141b582 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -45,6 +45,10 @@ To avoid interfering with other worktrees, it first enables the
 	a list of arguments following the 'set' subcommand. Update the
 	working directory to match the new patterns.
 
+'disable'::
+	Remove the sparse-checkout file, set `core.sparseCheckout` to
+	`false`, and restore the working directory to include all files.
+
 SPARSE CHECKOUT
 ----------------
 
@@ -62,6 +66,14 @@ directory, it updates the skip-worktree bits in the index based
 on this file. The files matching the patterns in the file will
 appear in the working directory, and the rest will not.
 
+To enable the sparse-checkout feature, run `git sparse-checkout init` to
+initialize a simple sparse-checkout file and enable the `core.sparseCheckout`
+config setting. Then, run `git sparse-checkout set` to modify the patterns in
+the sparse-checkout file.
+
+To repopulate the working directory with all files, use the
+`git sparse-checkout disable` command.
+
 ## FULL PATTERN SET
 
 By default, the sparse-checkout file uses the same syntax as `.gitignore`
@@ -76,21 +88,6 @@ using negative patterns. For example, to remove the file `unwanted`:
 !unwanted
 ----------------
 
-Another tricky thing is fully repopulating the working directory when you
-no longer want sparse checkout. You cannot just disable "sparse
-checkout" because skip-worktree bits are still in the index and your working
-directory is still sparsely populated. You should re-populate the working
-directory with the `$GIT_DIR/info/sparse-checkout` file content as
-follows:
-
-----------------
-/*
-----------------
-
-Then you can disable sparse checkout. Sparse checkout support in 'git
-checkout' and similar commands is disabled by default. You need to
-set `core.sparseCheckout` to `true` in order to have sparse checkout
-support.
 
 SEE ALSO
 --------
diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index b747b78d34..78a80ce119 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout [init|list|set] <options>"),
+	N_("git sparse-checkout [init|list|set|disable] <options>"),
 	NULL
 };
 
@@ -207,6 +207,28 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 	return result;
 }
 
+static int sparse_checkout_disable(int argc, const char **argv)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	if (sc_set_config(MODE_ALL_PATTERNS))
+		die(_("failed to change config"));
+
+	sparse_filename = get_sparse_checkout_filename();
+	fp = fopen(sparse_filename, "w");
+	fprintf(fp, "/*\n");
+	fclose(fp);
+
+	if (update_working_directory())
+		die(_("error while refreshing working directory"));
+
+	unlink(sparse_filename);
+	free(sparse_filename);
+
+	return sc_set_config(MODE_NO_PATTERNS);
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -231,6 +253,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_init(argc, argv);
 		if (!strcmp(argv[0], "set"))
 			return sparse_checkout_set(argc, argv, prefix);
+		if (!strcmp(argv[0], "disable"))
+			return sparse_checkout_disable(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 2a0137fde3..52d24c66ba 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -147,4 +147,19 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'sparse-checkout disable' '
+	git -C repo sparse-checkout disable &&
+	test_path_is_missing repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=false" config &&
+	ls repo >dir &&
+	cat >expect <<-EOF &&
+		a
+		deep
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 07/17] trace2: add region in clear_ce_flags
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
                         ` (5 preceding siblings ...)
  2019-10-15 13:55       ` [PATCH v4 06/17] sparse-checkout: create 'disable' subcommand Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Jeff Hostetler via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 08/17] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
                         ` (12 subsequent siblings)
  19 siblings, 0 replies; 196+ messages in thread
From: Jeff Hostetler via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

When Git updates the working directory with the sparse-checkout
feature enabled, the unpack_trees() method calls clear_ce_flags()
to update the skip-worktree bits on the cache entries. This
check can be expensive, depending on the patterns used.

Add trace2 regions around the method, including some flag
information, so we can get granular performance data during
experiments. This data will be used to measure improvements
to the pattern-matching algorithms for sparse-checkout.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 unpack-trees.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/unpack-trees.c b/unpack-trees.c
index 33ea7810d8..01a05ff66d 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1407,15 +1407,23 @@ static int clear_ce_flags(struct index_state *istate,
 			  struct pattern_list *pl)
 {
 	static struct strbuf prefix = STRBUF_INIT;
+	char label[100];
+	int rval;
 
 	strbuf_reset(&prefix);
 
-	return clear_ce_flags_1(istate,
+	xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)",
+		  (unsigned long)select_mask, (unsigned long)clear_mask);
+	trace2_region_enter("unpack_trees", label, the_repository);
+	rval = clear_ce_flags_1(istate,
 				istate->cache,
 				istate->cache_nr,
 				&prefix,
 				select_mask, clear_mask,
 				pl, 0);
+	trace2_region_leave("unpack_trees", label, the_repository);
+
+	return rval;
 }
 
 /*
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 08/17] sparse-checkout: add 'cone' mode
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
                         ` (6 preceding siblings ...)
  2019-10-15 13:55       ` [PATCH v4 07/17] trace2: add region in clear_ce_flags Jeff Hostetler via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 09/17] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
                         ` (11 subsequent siblings)
  19 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The sparse-checkout feature can have quadratic performance as
the number of patterns and number of entries in the index grow.
If there are 1,000 patterns and 1,000,000 entries, this time can
be very significant.

Create a new Boolean config option, core.sparseCheckoutCone, to
indicate that we expect the sparse-checkout file to contain a
more limited set of patterns. This is a separate config setting
from core.sparseCheckout to avoid breaking older clients by
introducing a tri-state option.

The config option does nothing right now, but will be expanded
upon in a later commit.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/config/core.txt         | 10 ++++--
 Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++
 cache.h                               |  4 ++-
 config.c                              |  5 +++
 environment.c                         |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 14 ++++++++
 6 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 852d2ba37a..bdbbee58b9 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -593,8 +593,14 @@ core.multiPackIndex::
 	multi-pack-index design document].
 
 core.sparseCheckout::
-	Enable "sparse checkout" feature. See section "Sparse checkout" in
-	linkgit:git-read-tree[1] for more information.
+	Enable "sparse checkout" feature. See linkgit:git-sparse-checkout[1]
+	for more information.
+
+core.sparseCheckoutCone::
+	Enables the "cone mode" of the sparse checkout feature. When the
+	sparse-checkout file contains a limited set of patterns, then this
+	mode provides significant performance advantages. See
+	linkgit:git-sparse-checkout[1] for more information.
 
 core.abbrev::
 	Set the length object names are abbreviated to.  If
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index b0c141b582..bb881aaa8e 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -89,6 +89,56 @@ using negative patterns. For example, to remove the file `unwanted`:
 ----------------
 
 
+## CONE PATTERN SET
+
+The full pattern set allows for arbitrary pattern matches and complicated
+inclusion/exclusion rules. These can result in O(N*M) pattern matches when
+updating the index, where N is the number of patterns and M is the number
+of paths in the index. To combat this performance issue, a more restricted
+pattern set is allowed when `core.sparseCheckoutCone` is enabled.
+
+The accepted patterns in the cone pattern set are:
+
+1. *Recursive:* All paths inside a directory are included.
+
+2. *Parent:* All files immediately inside a directory are included.
+
+In addition to the above two patterns, we also expect that all files in the
+root directory are included. If a recursive pattern is added, then all
+leading directories are added as parent patterns.
+
+By default, when running `git sparse-checkout init`, the root directory is
+added as a parent pattern. At this point, the sparse-checkout file contains
+the following patterns:
+
+```
+/*
+!/*/
+```
+
+This says "include everything in root, but nothing two levels below root."
+If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
+`A/B` are added as parent patterns. The resulting sparse-checkout file is
+now
+
+```
+/*
+!/*/
+/A/
+!/A/*/
+/A/B/
+!/A/B/*/
+/A/B/C/
+```
+
+Here, order matters, so the negative patterns are overridden by the positive
+patterns that appear lower in the file.
+
+If `core.sparseCheckoutCone=true`, then Git will parse the sparse-checkout file
+expecting patterns of these types. Git will warn if the patterns do not match.
+If the patterns do match the expected format, then Git will use faster hash-
+based algorithms to compute inclusion in the sparse-checkout.
+
 SEE ALSO
 --------
 
diff --git a/cache.h b/cache.h
index 04cabaac11..4980ee198e 100644
--- a/cache.h
+++ b/cache.h
@@ -918,12 +918,14 @@ extern char *git_replace_ref_base;
 
 extern int fsync_object_files;
 extern int core_preload_index;
-extern int core_apply_sparse_checkout;
 extern int precomposed_unicode;
 extern int protect_hfs;
 extern int protect_ntfs;
 extern const char *core_fsmonitor;
 
+int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
+
 /*
  * Include broken refs in all ref iterations, which will
  * generally choke dangerous operations rather than letting
diff --git a/config.c b/config.c
index e7052b3977..d75f88ca0c 100644
--- a/config.c
+++ b/config.c
@@ -1364,6 +1364,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.sparsecheckoutcone")) {
+		core_sparse_checkout_cone = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.precomposeunicode")) {
 		precomposed_unicode = git_config_bool(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index efa072680a..2a1a866659 100644
--- a/environment.c
+++ b/environment.c
@@ -67,6 +67,7 @@ enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
 char *notes_ref_name;
 int grafts_replace_parents = 1;
 int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
 int merge_log_config = -1;
 int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
 unsigned long pack_size_limit_cfg;
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 52d24c66ba..36fda5907b 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -147,6 +147,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: match patterns' '
+	git -C repo config --worktree core.sparseCheckoutCone true &&
+	rm -rf repo/a repo/folder1 repo/folder2 &&
+	git -C repo read-tree -mu HEAD &&
+	git -C repo reset --hard &&
+	ls repo >dir  &&
+	cat >expect <<-EOF &&
+		a
+		folder1
+		folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 09/17] sparse-checkout: use hashmaps for cone patterns
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
                         ` (7 preceding siblings ...)
  2019-10-15 13:55       ` [PATCH v4 08/17] sparse-checkout: add 'cone' mode Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-18 15:31         ` SZEDER Gábor
  2019-10-15 13:55       ` [PATCH v4 10/17] sparse-checkout: init and set in cone mode Derrick Stolee via GitGitGadget
                         ` (10 subsequent siblings)
  19 siblings, 1 reply; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

The parent and recursive patterns allowed by the "cone mode"
option in sparse-checkout are restrictive enough that we
can avoid using the regex parsing. Everything is based on
prefix matches, so we can use hashsets to store the prefixes
from the sparse-checkout file. When checking a path, we can
strip path entries from the path and check the hashset for
an exact match.

As a test, I created a cone-mode sparse-checkout file for the
Linux repository that actually includes every file. This was
constructed by taking every folder in the Linux repo and creating
the pattern pairs here:

	/$folder/
	!/$folder/*/

This resulted in a sparse-checkout file with 8,296 patterns.
Running 'git read-tree -mu HEAD' on this file had the following
performance:

	core.sparseCheckout=false: 0.21 s (0.00 s)
	 core.sparseCheckout=true: 3.75 s (3.50 s)
	 core.sparseCheckout=cone: 0.23 s (0.01 s)

The times in parentheses above correspond to the time spent
in the first clear_ce_flags() call, according to the trace2
performance traces.

While this example is contrived, it demonstrates how these
patterns can slow the sparse-checkout feature.

Helped-by: Eric Wong <e@80x24.org>
Helped-by: Johannes Schindelin <Johannes.Schindelin@gmx.de>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.c                              | 207 +++++++++++++++++++++++++++--
 dir.h                              |  31 +++++
 t/t1091-sparse-checkout-builtin.sh |  11 +-
 unpack-trees.c                     |   1 +
 4 files changed, 241 insertions(+), 9 deletions(-)

diff --git a/dir.c b/dir.c
index 61f559f980..dfabf9982f 100644
--- a/dir.c
+++ b/dir.c
@@ -611,6 +611,150 @@ void parse_path_pattern(const char **pattern,
 	*patternlen = len;
 }
 
+static int pl_hashmap_cmp(const void *unused_cmp_data,
+			  const struct hashmap_entry *a,
+			  const struct hashmap_entry *b,
+			  const void *key)
+{
+	const struct pattern_entry *ee1 =
+			container_of(a, struct pattern_entry, ent);
+	const struct pattern_entry *ee2 =
+			container_of(b, struct pattern_entry, ent);
+
+	size_t min_len = ee1->patternlen <= ee2->patternlen
+			 ? ee1->patternlen
+			 : ee2->patternlen;
+
+	return strncmp(ee1->pattern, ee2->pattern, min_len);
+}
+
+static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern *given)
+{
+	struct pattern_entry *translated;
+	char *truncated;
+	char *data = NULL;
+
+	if (!pl->use_cone_patterns)
+		return;
+
+	if (given->flags & PATTERN_FLAG_NEGATIVE &&
+	    given->flags & PATTERN_FLAG_MUSTBEDIR &&
+	    !strcmp(given->pattern, "/*")) {
+		pl->full_cone = 0;
+		return;
+	}
+
+	if (!given->flags && !strcmp(given->pattern, "/*")) {
+		pl->full_cone = 1;
+		return;
+	}
+
+	if (given->patternlen > 2 &&
+	    !strcmp(given->pattern + given->patternlen - 2, "/*")) {
+		if (!(given->flags & PATTERN_FLAG_NEGATIVE)) {
+			/* Not a cone pattern. */
+			pl->use_cone_patterns = 0;
+			warning(_("unrecognized pattern: '%s'"), given->pattern);
+			goto clear_hashmaps;
+		}
+
+		truncated = xstrdup(given->pattern);
+		truncated[given->patternlen - 2] = 0;
+
+		translated = xmalloc(sizeof(struct pattern_entry));
+		translated->pattern = truncated;
+		translated->patternlen = given->patternlen - 2;
+		hashmap_entry_init(&translated->ent,
+				   memhash(translated->pattern, translated->patternlen));
+
+		if (!hashmap_get_entry(&pl->recursive_hashmap,
+				       translated, ent, NULL)) {
+			/* We did not see the "parent" included */
+			warning(_("unrecognized negative pattern: '%s'"),
+				given->pattern);
+			free(truncated);
+			free(translated);
+			goto clear_hashmaps;
+		}
+
+		hashmap_add(&pl->parent_hashmap, &translated->ent);
+		hashmap_remove(&pl->recursive_hashmap, &translated->ent, &data);
+		free(data);
+		return;
+	}
+
+	if (given->flags & PATTERN_FLAG_NEGATIVE) {
+		warning(_("unrecognized negative pattern: '%s'"),
+			given->pattern);
+		goto clear_hashmaps;
+	}
+
+	translated = xmalloc(sizeof(struct pattern_entry));
+
+	translated->pattern = xstrdup(given->pattern);
+	translated->patternlen = given->patternlen;
+	hashmap_entry_init(&translated->ent,
+			   memhash(translated->pattern, translated->patternlen));
+
+	hashmap_add(&pl->recursive_hashmap, &translated->ent);
+
+	if (hashmap_get_entry(&pl->parent_hashmap, translated, ent, NULL)) {
+		/* we already included this at the parent level */
+		warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
+			given->pattern);
+		hashmap_remove(&pl->parent_hashmap, &translated->ent, &data);
+		free(data);
+		free(translated);
+	}
+
+	return;
+
+clear_hashmaps:
+	warning(_("disabling cone pattern matching"));
+	hashmap_free_entries(&pl->parent_hashmap, struct pattern_entry, ent);
+	hashmap_free_entries(&pl->recursive_hashmap, struct pattern_entry, ent);
+	pl->use_cone_patterns = 0;
+}
+
+static int hashmap_contains_path(struct hashmap *map,
+				 struct strbuf *pattern)
+{
+	struct pattern_entry p;
+
+	/* Check straight mapping */
+	p.pattern = pattern->buf;
+	p.patternlen = pattern->len;
+	hashmap_entry_init(&p.ent, memhash(p.pattern, p.patternlen));
+	return !!hashmap_get_entry(map, &p, ent, NULL);
+}
+
+int hashmap_contains_parent(struct hashmap *map,
+			    const char *path,
+			    struct strbuf *buffer)
+{
+	char *slash_pos;
+
+	strbuf_setlen(buffer, 0);
+
+	if (path[0] != '/')
+		strbuf_addch(buffer, '/');
+
+	strbuf_addstr(buffer, path);
+
+	slash_pos = strrchr(buffer->buf, '/');
+
+	while (slash_pos > buffer->buf) {
+		strbuf_setlen(buffer, slash_pos - buffer->buf);
+
+		if (hashmap_contains_path(map, buffer))
+			return 1;
+
+		slash_pos = strrchr(buffer->buf, '/');
+	}
+
+	return 0;
+}
+
 void add_pattern(const char *string, const char *base,
 		 int baselen, struct pattern_list *pl, int srcpos)
 {
@@ -635,6 +779,8 @@ void add_pattern(const char *string, const char *base,
 	ALLOC_GROW(pl->patterns, pl->nr + 1, pl->alloc);
 	pl->patterns[pl->nr++] = pattern;
 	pattern->pl = pl;
+
+	add_pattern_to_hashsets(pl, pattern);
 }
 
 static int read_skip_worktree_file_from_index(const struct index_state *istate,
@@ -860,6 +1006,9 @@ static int add_patterns_from_buffer(char *buf, size_t size,
 	int i, lineno = 1;
 	char *entry;
 
+	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
+	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
+
 	pl->filebuf = buf;
 
 	if (skip_utf8_bom(&buf, size))
@@ -1096,16 +1245,58 @@ enum pattern_match_result path_matches_pattern_list(
 				struct index_state *istate)
 {
 	struct path_pattern *pattern;
-	pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
-						  dtype, pl, istate);
-	if (pattern) {
-		if (pattern->flags & PATTERN_FLAG_NEGATIVE)
-			return NOT_MATCHED;
-		else
-			return MATCHED;
+	struct strbuf parent_pathname = STRBUF_INIT;
+	int result = NOT_MATCHED;
+	const char *slash_pos;
+
+	if (!pl->use_cone_patterns) {
+		pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
+							dtype, pl, istate);
+		if (pattern) {
+			if (pattern->flags & PATTERN_FLAG_NEGATIVE)
+				return NOT_MATCHED;
+			else
+				return MATCHED;
+		}
+
+		return UNDECIDED;
+	}
+
+	if (pl->full_cone)
+		return MATCHED;
+
+	strbuf_addch(&parent_pathname, '/');
+	strbuf_add(&parent_pathname, pathname, pathlen);
+
+	if (hashmap_contains_path(&pl->recursive_hashmap,
+				  &parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	slash_pos = strrchr(parent_pathname.buf, '/');
+
+	if (slash_pos == parent_pathname.buf) {
+		/* include every file in root */
+		result = MATCHED;
+		goto done;
 	}
 
-	return UNDECIDED;
+	strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+
+	if (hashmap_contains_path(&pl->parent_hashmap, &parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	if (hashmap_contains_parent(&pl->recursive_hashmap,
+				    pathname,
+				    &parent_pathname))
+		result = MATCHED;
+
+done:
+	strbuf_release(&parent_pathname);
+	return result;
 }
 
 static struct path_pattern *last_matching_pattern_from_lists(
diff --git a/dir.h b/dir.h
index 2fbdef014f..f8edbca72b 100644
--- a/dir.h
+++ b/dir.h
@@ -4,6 +4,7 @@
 /* See Documentation/technical/api-directory-listing.txt */
 
 #include "cache.h"
+#include "hashmap.h"
 #include "strbuf.h"
 
 struct dir_entry {
@@ -37,6 +38,13 @@ struct path_pattern {
 	int srcpos;
 };
 
+/* used for hashmaps for cone patterns */
+struct pattern_entry {
+	struct hashmap_entry ent;
+	char *pattern;
+	size_t patternlen;
+};
+
 /*
  * Each excludes file will be parsed into a fresh exclude_list which
  * is appended to the relevant exclude_list_group (either EXC_DIRS or
@@ -55,6 +63,26 @@ struct pattern_list {
 	const char *src;
 
 	struct path_pattern **patterns;
+
+	/*
+	 * While scanning the excludes, we attempt to match the patterns
+	 * with a more restricted set that allows us to use hashsets for
+	 * matching logic, which is faster than the linear lookup in the
+	 * excludes array above. If non-zero, that check succeeded.
+	 */
+	unsigned use_cone_patterns;
+	unsigned full_cone;
+
+	/*
+	 * Stores paths where everything starting with those paths
+	 * is included.
+	 */
+	struct hashmap recursive_hashmap;
+
+	/*
+	 * Used to check single-level parents of blobs.
+	 */
+	struct hashmap parent_hashmap;
 };
 
 /*
@@ -271,6 +299,9 @@ int is_excluded(struct dir_struct *dir,
 		struct index_state *istate,
 		const char *name, int *dtype);
 
+int hashmap_contains_parent(struct hashmap *map,
+			    const char *path,
+			    struct strbuf *buffer);
 struct pattern_list *add_pattern_list(struct dir_struct *dir,
 				      int group_type, const char *src);
 int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 36fda5907b..b0d5aeb33a 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -150,7 +150,8 @@ test_expect_success 'set sparse-checkout using --stdin' '
 test_expect_success 'cone mode: match patterns' '
 	git -C repo config --worktree core.sparseCheckoutCone true &&
 	rm -rf repo/a repo/folder1 repo/folder2 &&
-	git -C repo read-tree -mu HEAD &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep ! "disabling cone patterns" err &&
 	git -C repo reset --hard &&
 	ls repo >dir  &&
 	cat >expect <<-EOF &&
@@ -161,6 +162,14 @@ test_expect_success 'cone mode: match patterns' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: warn on bad pattern' '
+	test_when_finished mv sparse-checkout repo/.git/info/ &&
+	cp repo/.git/info/sparse-checkout . &&
+	echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep "unrecognized negative pattern" err
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&
diff --git a/unpack-trees.c b/unpack-trees.c
index 01a05ff66d..a90d71845d 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1482,6 +1482,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 		o->skip_sparse_checkout = 1;
 	if (!o->skip_sparse_checkout) {
 		char *sparse = git_pathdup("info/sparse-checkout");
+		pl.use_cone_patterns = core_sparse_checkout_cone;
 		if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
 			o->skip_sparse_checkout = 1;
 		else
-- 
gitgitgadget


^ permalink raw reply	[flat|nested] 196+ messages in thread

* [PATCH v4 10/17] sparse-checkout: init and set in cone mode
  2019-10-15 13:55     ` [PATCH v4 " Derrick Stolee via GitGitGadget
                         ` (8 preceding siblings ...)
  2019-10-15 13:55       ` [PATCH v4 09/17] sparse-checkout: use hashmaps for cone patterns Derrick Stolee via GitGitGadget
@ 2019-10-15 13:55       ` Derrick Stolee via GitGitGadget
  2019-10-15 13:55       ` [PATCH v4 11/17] unpack-trees: hash less " Derrick Stolee via GitGitGadget
                         ` (9 subsequent siblings)
  19 siblings, 0 replies; 196+ messages in thread
From: Derrick Stolee via GitGitGadget @ 2019-10-15 13:55 UTC (permalink / raw)
  To: git; +Cc: newren, Derrick Stolee, Junio C Hamano, Derrick Stolee

From: Derrick Stolee <dstolee@microsoft.com>

To make the cone pattern set easy to use, update the behavior of
'git sparse-checkout [init|set]'.

Add '--cone' flag to 'git sparse-checkout init' to set the config
option 'core.sparseCheckoutCone=true'.

When running 'git sparse-checkout set' in cone mode, a user only
needs to supply a list of recursive folder matches. Git will
automatically add the necessary parent matches for the leading
directories.

When testing 'git sparse-checkout set' in cone mode, check the
error stream to ensure we do not see any errors. Specifically,
we want to avoid the warning that the patterns do not match
the cone-mode patterns.

Helped-by: Eric Wong <e@80x24.org>
Helped-by: Johannes Schindelin <Johannes.Schindelin@gmx.de>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 builtin/sparse-checkout.c          | 159 +++++++++++++++++++++++++++--
 dir.c                              |   8 +-
 dir.h                              |   4 +
 t/t1091-sparse-checkout-builtin.sh |  51 +++++++++
 4 files changed, 208 insertions(+), 14 deletions(-)

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index 78a80ce119..95d9a90b84 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -6,6 +6,7 @@
 #include "repository.h"
 #include "run-command.h"
 #include "strbuf.h"
+#include "string-list.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
 	N_("git sparse-checkout [init|list|set|disable] <options>"),
@@ -77,11 +78,13 @@ static int update_working_directory(void)
 enum sparse_checkout_mode {
 	MODE_NO_PATTERNS = 0,
 	MODE_ALL_PATTERNS = 1,
+	MODE_CONE_PATTERNS = 2,
 };
 
 static int sc_set_config(enum sparse_checkout_mode mode)
 {
 	struct argv_array argv = ARGV_ARRAY_INIT;
+	struct argv_array cone_argv = ARGV_ARRAY_INIT;
 
 	if (git_config_set_gently("extensions.worktreeConfig", "true")) {
 		error(_("failed to set extensions.worktreeConfig setting"));
@@ -100,9 +103,31 @@ static int sc_set_config(enum sparse_checkout_mode mode)
 		return 1;
 	}
 
+	argv_array_pushl(&cone_argv, "config", "--worktree",
+			 "core.sparseCheckoutCone", NULL);
+
+	if (mode == MODE_CONE_PATTERNS)
+		argv_array_push(&cone_argv, "true");
+	else
+		argv_array_push(&cone_argv, "false");
+
+	if (run_command_v_opt(cone_argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to enable core.sparseCheckoutCone"));
+		return 1;
+	}
+
 	return 0;
 }
 
+static char const * const builtin_sparse_checkout_init_usage[] = {
+	N_("git sparse-checkout init [--cone]"),
+	NULL
+};
+
+static struct sparse_checkout_init_opts {
+	int cone_mode;
+} init_opts;
+
 static int sparse_checkout_init(int argc, const char **argv)
 {
 	struct pattern_list pl;
@@ -110,8 +135,21 @@ static int sparse_checkout_init(int argc, const char **argv)
 	FILE *fp;
 	int res;
 	struct object_id oid;
+	int mode;
 
-	if (sc_set_config(MODE_ALL_PATTERNS))
+	static struct option builtin_sparse_checkout_init_options[] = {
+		OPT_BOOL(0, "cone", &init_opts.cone_mode,
+			 N_("initialize the sparse-checkout in cone mode")),
+		OPT_END(),
+	};
+
+	argc = parse_options(argc, argv, NULL,
+			     builtin_sparse_checkout_init_options,
+			     builtin_sparse_checkout_init_usage, 0);
+
+	mode = init_opts.cone_mode ? MODE_CONE_PATTERNS : MODE_ALL_PATTERNS;
+
+	if (sc_set_config(mode))
 		return 1;
 
 	memset(&pl, 0, sizeof(pl));
@@ -140,6 +178,70 @@ static int sparse_checkout_init(int argc, const char **argv)
 	return update_working_directory();
 }
 
+static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path)
+{
+	struct pattern_entry *e = xmalloc(sizeof(*e));
+	e->patternlen = path->len;
+	e->pattern = strbuf_detach(path, NULL);
+	hashmap_entry_init(&e->ent, memhash(e->pattern, e->patternlen));
+
+	hashmap_add(&pl->recursive_hashmap, &e->ent);
+
+	while (e->patternlen) {
+		char *slash = strrchr(e->pattern, '/');
+		char *oldpattern = e->pattern;
+		size_t newlen;
+
+		if (slash == e->pattern)
+			break;
+
+		newlen = slash - e->pattern;
+		e = xmalloc(sizeof(struct pattern_entry));
+		e->patternlen = newlen;
+		e->pattern = xstrndup(oldpattern, newlen);
+		hashmap_entry_init(&e->ent, memhash(e->pattern, e->patternlen));
+
+		if (!hashmap_get_entry(&pl->parent_hashmap, e, ent, NULL))
+			hashmap_add(&pl->parent_hashmap, &e->ent);
+	}
+}
+
+static void write_cone_to_file(FILE *fp, struct pattern_list *pl)
+{
+	int i;
+	struct pattern_entry *pe;
+	struct hashmap_iter iter;
+	struct string_list sl = STRING_LIST_INIT_DUP;
+
+	hashmap_for_each_entry(&pl->parent_hashmap, &iter, pe, ent)
+		string_list_insert(&sl, pe->pattern);
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	fprintf(fp, "/*\n!/*/\n");
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+
+		if (strlen(pattern))
+			fprintf(fp, "%s/\n!%s/*/\n", pattern, pattern);
+	}
+
+	string_list_clear(&sl, 0);
+
+	hashmap_for_each_entry(&pl->recursive_hashmap, &iter, pe, ent)
+		string_list_insert(&sl, pe->pattern);
+
+	string_list_sort(&sl);
+	string_list_remove_duplicates(&sl, 0);
+
+	for (i = 0; i < sl.nr; i++) {
+		char *pattern = sl.items[i].string;
+		fprintf(fp, "%s/\n", pattern);
+	}
+}
+
 static int write_patterns_and_update(struct pattern_list *pl)
 {
 	char *sparse_filename;
@@ -152,13 +254,33 @@ static int write_patterns_and_update(struct pattern_list *pl)
 
 	sparse_filename = get_sparse_checkout_filename();
 	fp = fopen(sparse_filename, "w");
-	write_patterns_to_file(fp, pl);
+
+	if (core_sparse_checkout_cone)
+		write_cone_to_file(fp, pl);
+	else
+		write_patterns_to_file(fp, pl);
+
 	fclose(fp);
 	free(sparse_filename);
 
 	return u