git@vger.kernel.org list mirror (unofficial, one of many)
 help / color / mirror / code / Atom feed
* [PATCH v12 0/5] read-cache: speed up add_index_entry
@ 2017-04-19 17:06 git
  2017-04-19 17:06 ` [PATCH v12 1/5] read-cache: add strcmp_offset function git
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: git @ 2017-04-19 17:06 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Version 12 adds a new t/perf/repo/inflate-repo.sh script to let you
inflate a test repo, such as a copy of git.git or linux.git, to have
a branch containing a very large number of (non-synthetic) files.

It also fixes the "##" comments in the many-files.sh script
as mentioned on the mailing list.

I've also updated the commit message on part 2 to show the
results when run on an inflated copy of linux.git with 1M+ files.

Jeff Hostetler (5):
  read-cache: add strcmp_offset function
  p0006-read-tree-checkout: perf test to time read-tree
  read-cache: speed up add_index_entry during checkout
  read-cache: speed up has_dir_name (part 1)
  read-cache: speed up has_dir_name (part 2)

 Makefile                           |   1 +
 cache.h                            |   1 +
 read-cache.c                       | 139 ++++++++++++++++++++++++++++++++++++-
 t/helper/.gitignore                |   1 +
 t/helper/test-strcmp-offset.c      |  22 ++++++
 t/perf/p0006-read-tree-checkout.sh |  67 ++++++++++++++++++
 t/perf/repos/.gitignore            |   1 +
 t/perf/repos/inflate-repo.sh       |  86 +++++++++++++++++++++++
 t/perf/repos/many-files.sh         | 110 +++++++++++++++++++++++++++++
 t/t0065-strcmp-offset.sh           |  21 ++++++
 10 files changed, 447 insertions(+), 2 deletions(-)
 create mode 100644 t/helper/test-strcmp-offset.c
 create mode 100755 t/perf/p0006-read-tree-checkout.sh
 create mode 100644 t/perf/repos/.gitignore
 create mode 100755 t/perf/repos/inflate-repo.sh
 create mode 100755 t/perf/repos/many-files.sh
 create mode 100755 t/t0065-strcmp-offset.sh

-- 
2.9.3


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v12 1/5] read-cache: add strcmp_offset function
  2017-04-19 17:06 [PATCH v12 0/5] read-cache: speed up add_index_entry git
@ 2017-04-19 17:06 ` git
  2017-04-19 17:06 ` [PATCH v12 2/5] p0006-read-tree-checkout: perf test to time read-tree git
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: git @ 2017-04-19 17:06 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Add strcmp_offset() function to also return the offset of the
first change.

Add unit test and helper to verify.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile                      |  1 +
 cache.h                       |  1 +
 read-cache.c                  | 20 ++++++++++++++++++++
 t/helper/.gitignore           |  1 +
 t/helper/test-strcmp-offset.c | 22 ++++++++++++++++++++++
 t/t0065-strcmp-offset.sh      | 21 +++++++++++++++++++++
 6 files changed, 66 insertions(+)
 create mode 100644 t/helper/test-strcmp-offset.c
 create mode 100755 t/t0065-strcmp-offset.sh

diff --git a/Makefile b/Makefile
index 9ec6065..4c4c246 100644
--- a/Makefile
+++ b/Makefile
@@ -631,6 +631,7 @@ TEST_PROGRAMS_NEED_X += test-scrap-cache-tree
 TEST_PROGRAMS_NEED_X += test-sha1
 TEST_PROGRAMS_NEED_X += test-sha1-array
 TEST_PROGRAMS_NEED_X += test-sigchain
+TEST_PROGRAMS_NEED_X += test-strcmp-offset
 TEST_PROGRAMS_NEED_X += test-string-list
 TEST_PROGRAMS_NEED_X += test-submodule-config
 TEST_PROGRAMS_NEED_X += test-subprocess
diff --git a/cache.h b/cache.h
index 80b6372..3c55047 100644
--- a/cache.h
+++ b/cache.h
@@ -574,6 +574,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
 extern int discard_index(struct index_state *);
 extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
+extern int strcmp_offset(const char *s1, const char *s2, size_t *first_change);
 extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
 extern void adjust_dirname_case(struct index_state *istate, char *name);
 extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);
diff --git a/read-cache.c b/read-cache.c
index 9054369..97f13a1 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -887,6 +887,26 @@ static int has_file_name(struct index_state *istate,
 	return retval;
 }
 
+
+/*
+ * Like strcmp(), but also return the offset of the first change.
+ * If strings are equal, return the length.
+ */
+int strcmp_offset(const char *s1, const char *s2, size_t *first_change)
+{
+	size_t k;
+
+	if (!first_change)
+		return strcmp(s1, s2);
+
+	for (k = 0; s1[k] == s2[k]; k++)
+		if (s1[k] == '\0')
+			break;
+
+	*first_change = k;
+	return (unsigned char)s1[k] - (unsigned char)s2[k];
+}
+
 /*
  * Do we have another file with a pathname that is a proper
  * subset of the name we're trying to add?
diff --git a/t/helper/.gitignore b/t/helper/.gitignore
index d6e8b36..0a89531 100644
--- a/t/helper/.gitignore
+++ b/t/helper/.gitignore
@@ -25,6 +25,7 @@
 /test-sha1
 /test-sha1-array
 /test-sigchain
+/test-strcmp-offset
 /test-string-list
 /test-submodule-config
 /test-subprocess
diff --git a/t/helper/test-strcmp-offset.c b/t/helper/test-strcmp-offset.c
new file mode 100644
index 0000000..4a45a54
--- /dev/null
+++ b/t/helper/test-strcmp-offset.c
@@ -0,0 +1,22 @@
+#include "cache.h"
+
+int cmd_main(int argc, const char **argv)
+{
+	int result;
+	size_t offset;
+
+	if (!argv[1] || !argv[2])
+		die("usage: %s <string1> <string2>", argv[0]);
+
+	result = strcmp_offset(argv[1], argv[2], &offset);
+
+	/*
+	 * Because differnt CRTs behave differently, only rely on signs
+	 * of the result values.
+	 */
+	result = (result < 0 ? -1 :
+			  result > 0 ? 1 :
+			  0);
+	printf("%d %"PRIuMAX"\n", result, (uintmax_t)offset);
+	return 0;
+}
diff --git a/t/t0065-strcmp-offset.sh b/t/t0065-strcmp-offset.sh
new file mode 100755
index 0000000..7d6d214
--- /dev/null
+++ b/t/t0065-strcmp-offset.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+test_description='Test strcmp_offset functionality'
+
+. ./test-lib.sh
+
+while read s1 s2 expect
+do
+	test_expect_success "strcmp_offset($s1, $s2)" '
+		echo "$expect" >expect &&
+		test-strcmp-offset "$s1" "$s2" >actual &&
+		test_cmp expect actual
+	'
+done <<-EOF
+abc abc 0 3
+abc def -1 0
+abc abz -1 2
+abc abcdef -1 3
+EOF
+
+test_done
-- 
2.9.3


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v12 2/5] p0006-read-tree-checkout: perf test to time read-tree
  2017-04-19 17:06 [PATCH v12 0/5] read-cache: speed up add_index_entry git
  2017-04-19 17:06 ` [PATCH v12 1/5] read-cache: add strcmp_offset function git
@ 2017-04-19 17:06 ` git
  2017-04-19 17:06 ` [PATCH v12 3/5] read-cache: speed up add_index_entry during checkout git
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: git @ 2017-04-19 17:06 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Created t/perf/repos/many-files.sh to generate large, but
artificial repositories.

Created t/perf/inflate-repo.sh to alter an EXISTING repo
to have a set of large commits.  This can be used to create
a branch with 1M+ files in repositories like git.git or
linux.git, but with more realistic content.  It does this
by making multiple copies of the entire worktree in a series
of sub-directories.

The branch name and ballast structure created by both scripts
match, so either script can be used to generate very large
test repositories for the following perf test.

Created t/perf/p0006-read-tree-checkout.sh to measure
performance on various read-tree, checkout, and update-index
operations.  This test can run using either normal repos or
ones from the above scripts.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 t/perf/p0006-read-tree-checkout.sh |  67 ++++++++++++++++++++++
 t/perf/repos/.gitignore            |   1 +
 t/perf/repos/inflate-repo.sh       |  86 +++++++++++++++++++++++++++++
 t/perf/repos/many-files.sh         | 110 +++++++++++++++++++++++++++++++++++++
 4 files changed, 264 insertions(+)
 create mode 100755 t/perf/p0006-read-tree-checkout.sh
 create mode 100644 t/perf/repos/.gitignore
 create mode 100755 t/perf/repos/inflate-repo.sh
 create mode 100755 t/perf/repos/many-files.sh

diff --git a/t/perf/p0006-read-tree-checkout.sh b/t/perf/p0006-read-tree-checkout.sh
new file mode 100755
index 0000000..78cc23f
--- /dev/null
+++ b/t/perf/p0006-read-tree-checkout.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+#
+# This test measures the performance of various read-tree
+# and checkout operations.  It is primarily interested in
+# the algorithmic costs of index operations and recursive
+# tree traversal -- and NOT disk I/O on thousands of files.
+
+test_description="Tests performance of read-tree"
+
+. ./perf-lib.sh
+
+test_perf_default_repo
+
+# If the test repo was generated by ./repos/many-files.sh
+# then we know something about the data shape and branches,
+# so we can isolate testing to the ballast-related commits
+# and setup sparse-checkout so we don't have to populate
+# the ballast files and directories.
+#
+# Otherwise, we make some general assumptions about the
+# repo and consider the entire history of the current
+# branch to be the ballast.
+
+test_expect_success "setup repo" '
+	if git rev-parse --verify refs/heads/p0006-ballast^{commit}
+	then
+		echo Assuming synthetic repo from many-files.sh
+		git branch br_base            master
+		git branch br_ballast         p0006-ballast^
+		git branch br_ballast_alias   p0006-ballast^
+		git branch br_ballast_plus_1  p0006-ballast
+		git config --local core.sparsecheckout 1
+		cat >.git/info/sparse-checkout <<-EOF
+		/*
+		!ballast/*
+		EOF
+	else
+		echo Assuming non-synthetic repo...
+		git branch br_base            $(git rev-list HEAD | tail -n 1)
+		git branch br_ballast         HEAD^ || error "no ancestor commit from current head"
+		git branch br_ballast_alias   HEAD^
+		git branch br_ballast_plus_1  HEAD
+	fi &&
+	git checkout -q br_ballast &&
+	nr_files=$(git ls-files | wc -l)
+'
+
+test_perf "read-tree br_base br_ballast ($nr_files)" '
+	git read-tree -m br_base br_ballast -n
+'
+
+test_perf "switch between br_base br_ballast ($nr_files)" '
+	git checkout -q br_base &&
+	git checkout -q br_ballast
+'
+
+test_perf "switch between br_ballast br_ballast_plus_1 ($nr_files)" '
+	git checkout -q br_ballast_plus_1 &&
+	git checkout -q br_ballast
+'
+
+test_perf "switch between aliases ($nr_files)" '
+	git checkout -q br_ballast_alias &&
+	git checkout -q br_ballast
+'
+
+test_done
diff --git a/t/perf/repos/.gitignore b/t/perf/repos/.gitignore
new file mode 100644
index 0000000..72e3dc3
--- /dev/null
+++ b/t/perf/repos/.gitignore
@@ -0,0 +1 @@
+gen-*/
diff --git a/t/perf/repos/inflate-repo.sh b/t/perf/repos/inflate-repo.sh
new file mode 100755
index 0000000..64f5d7a
--- /dev/null
+++ b/t/perf/repos/inflate-repo.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+# Inflate the size of an EXISTING repo.
+#
+# This script should be run inside the worktree of a TEST repo.
+# It will use the contents of the current HEAD to generate a
+# commit containing copies of the current worktree such that the
+# total size of the commit has at least <target_size> files.
+#
+# Usage: [-t target_size] [-b branch_name]
+
+set -e
+
+target_size=10000
+branch_name=p0006-ballast
+ballast=ballast
+
+while test "$#" -ne 0
+do
+    case "$1" in
+	-b)
+	    shift;
+	    test "$#" -ne 0 || { echo 'error: -b requires an argument' >&2; exit 1; }
+	    branch_name=$1;
+	    shift ;;
+	-t)
+	    shift;
+	    test "$#" -ne 0 || { echo 'error: -t requires an argument' >&2; exit 1; }
+	    target_size=$1;
+	    shift ;;
+	*)
+	    echo "error: unknown option '$1'" >&2; exit 1 ;;
+    esac
+done
+
+git ls-tree -r HEAD >GEN_src_list
+nr_src_files=$(cat GEN_src_list | wc -l)
+
+src_branch=$(git symbolic-ref --short HEAD)
+
+echo "Branch $src_branch initially has $nr_src_files files."
+
+if test $target_size -le $nr_src_files
+then
+    echo "Repository already exceeds target size $target_size."
+    rm GEN_src_list
+    exit 1
+fi
+
+# Create well-known branch and add 1 file change to start
+# if off before the ballast.
+git checkout -b $branch_name HEAD
+echo "$target_size" > inflate-repo.params
+git add inflate-repo.params
+git commit -q -m params
+
+# Create ballast for in our branch.
+copy=1
+nr_files=$nr_src_files
+while test $nr_files -lt $target_size
+do
+    sed -e "s|	|	$ballast/$copy/|" <GEN_src_list |
+	git update-index --index-info
+
+    nr_files=$(expr $nr_files + $nr_src_files)
+    copy=$(expr $copy + 1)
+done
+rm GEN_src_list
+git commit -q -m "ballast"
+
+# Modify 1 file and commit.
+echo "$target_size" >> inflate-repo.params
+git add inflate-repo.params
+git commit -q -m "ballast plus 1"
+
+nr_files=$(git ls-files | wc -l)
+
+# Checkout master to put repo in canonical state (because
+# the perf test may need to clone and enable sparse-checkout
+# before attempting to checkout a commit with the ballast
+# (because it may contain 100K directories and 1M files)).
+git checkout $src_branch
+
+echo "Repository inflated. Branch $branch_name has $nr_files files."
+
+exit 0
+
diff --git a/t/perf/repos/many-files.sh b/t/perf/repos/many-files.sh
new file mode 100755
index 0000000..28720e4
--- /dev/null
+++ b/t/perf/repos/many-files.sh
@@ -0,0 +1,110 @@
+#!/bin/sh
+# Generate test data repository using the given parameters.
+# When omitted, we create "gen-many-files-d-w-f.git".
+#
+# Usage: [-r repo] [-d depth] [-w width] [-f files]
+#
+# -r repo: path to the new repo to be generated
+# -d depth: the depth of sub-directories
+# -w width: the number of sub-directories at each level
+# -f files: the number of files created in each directory
+#
+# Note that all files will have the same SHA-1 and each
+# directory at a level will have the same SHA-1, so we
+# will potentially have a large index, but not a large
+# ODB.
+#
+# Ballast will be created under "ballast/".
+
+EMPTY_BLOB=e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
+
+set -e
+
+# (5, 10, 9) will create 999,999 ballast files.
+# (4, 10, 9) will create  99,999 ballast files.
+depth=5
+width=10
+files=9
+
+while test "$#" -ne 0
+do
+    case "$1" in
+	-r)
+	    shift;
+	    test "$#" -ne 0 || { echo 'error: -r requires an argument' >&2; exit 1; }
+	    repo=$1;
+	    shift ;;
+	-d)
+	    shift;
+	    test "$#" -ne 0 || { echo 'error: -d requires an argument' >&2; exit 1; }
+	    depth=$1;
+	    shift ;;
+	-w)
+	    shift;
+	    test "$#" -ne 0 || { echo 'error: -w requires an argument' >&2; exit 1; }
+	    width=$1;
+	    shift ;;
+	-f)
+	    shift;
+	    test "$#" -ne 0 || { echo 'error: -f requires an argument' >&2; exit 1; }
+	    files=$1;
+	    shift ;;
+	*)
+	    echo "error: unknown option '$1'" >&2; exit 1 ;;
+	esac
+done
+
+# Inflate the index with thousands of empty files.
+# usage: dir depth width files
+fill_index() {
+	awk -v arg_dir=$1 -v arg_depth=$2 -v arg_width=$3 -v arg_files=$4 '
+		function make_paths(dir, depth, width, files, f, w) {
+			for (f = 1; f <= files; f++) {
+				print dir "/file" f
+			}
+			if (depth > 0) {
+				for (w = 1; w <= width; w++) {
+					make_paths(dir "/dir" w, depth - 1, width, files)
+				}
+			}
+		}
+		END { make_paths(arg_dir, arg_depth, arg_width, arg_files) }
+		' </dev/null |
+	sed "s/^/100644 $EMPTY_BLOB	/" |
+	git update-index --index-info
+	return 0
+}
+
+[ -z "$repo" ] && repo=gen-many-files-$depth.$width.$files.git
+
+mkdir $repo
+cd $repo
+git init .
+
+# Create an initial commit just to define master.
+touch many-files.empty
+echo "$depth $width $files" >many-files.params
+git add many-files.*
+git commit -q -m params
+
+# Create ballast for p0006 based upon the given params and
+# inflate the index with thousands of empty files and commit.
+git checkout -b p0006-ballast
+fill_index "ballast" $depth $width $files
+git commit -q -m "ballast"
+
+nr_files=$(git ls-files | wc -l)
+
+# Modify 1 file and commit.
+echo "$depth $width $files" >>many-files.params
+git add many-files.params
+git commit -q -m "ballast plus 1"
+
+# Checkout master to put repo in canonical state (because
+# the perf test may need to clone and enable sparse-checkout
+# before attempting to checkout a commit with the ballast
+# (because it may contain 100K directories and 1M files)).
+git checkout master
+
+echo "Repository "$repo" ($depth, $width, $files) created.  Ballast $nr_files."
+exit 0
-- 
2.9.3


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v12 3/5] read-cache: speed up add_index_entry during checkout
  2017-04-19 17:06 [PATCH v12 0/5] read-cache: speed up add_index_entry git
  2017-04-19 17:06 ` [PATCH v12 1/5] read-cache: add strcmp_offset function git
  2017-04-19 17:06 ` [PATCH v12 2/5] p0006-read-tree-checkout: perf test to time read-tree git
@ 2017-04-19 17:06 ` git
  2017-04-19 17:06 ` [PATCH v12 4/5] read-cache: speed up has_dir_name (part 1) git
  2017-04-19 17:06 ` [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2) git
  4 siblings, 0 replies; 11+ messages in thread
From: git @ 2017-04-19 17:06 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach add_index_entry_with_check() to see if the path
of the new item is greater than the last path in the
index array before attempting to search for it.

During checkout, merge_working_tree() populates the new
index in sorted order, so this change will save a binary
lookups per file.  This preserves the original behavior
but simply checks the last element before starting the
search.

This helps performance on very large repositories.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 read-cache.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/read-cache.c b/read-cache.c
index 97f13a1..6a27688 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1021,7 +1021,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
 
 	if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
 		cache_tree_invalidate_path(istate, ce->name);
-	pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
+
+	/*
+	 * If this entry's path sorts after the last entry in the index,
+	 * we can avoid searching for it.
+	 */
+	if (istate->cache_nr > 0 &&
+		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
+		pos = -istate->cache_nr - 1;
+	else
+		pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
 
 	/* existing match? Just replace it. */
 	if (pos >= 0) {
-- 
2.9.3


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v12 4/5] read-cache: speed up has_dir_name (part 1)
  2017-04-19 17:06 [PATCH v12 0/5] read-cache: speed up add_index_entry git
                   ` (2 preceding siblings ...)
  2017-04-19 17:06 ` [PATCH v12 3/5] read-cache: speed up add_index_entry during checkout git
@ 2017-04-19 17:06 ` git
  2017-04-19 17:06 ` [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2) git
  4 siblings, 0 replies; 11+ messages in thread
From: git @ 2017-04-19 17:06 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach has_dir_name() to see if the path of the new item
is greater than the last path in the index array before
attempting to search for it.

has_dir_name() is looking for file/directory collisions
in the index and has to consider each sub-directory
prefix in turn.  This can cause multiple binary searches
for each path.

During operations like checkout, merge_working_tree()
populates the new index in sorted order, so we expect
to be able to append in many cases.

This commit is part 1 of 2.  This commit handles the top
of has_dir_name() and the trivial optimization.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 read-cache.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/read-cache.c b/read-cache.c
index 6a27688..9af0bd4 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -910,6 +910,9 @@ int strcmp_offset(const char *s1, const char *s2, size_t *first_change)
 /*
  * Do we have another file with a pathname that is a proper
  * subset of the name we're trying to add?
+ *
+ * That is, is there another file in the index with a path
+ * that matches a sub-directory in the given entry?
  */
 static int has_dir_name(struct index_state *istate,
 			const struct cache_entry *ce, int pos, int ok_to_replace)
@@ -918,6 +921,48 @@ static int has_dir_name(struct index_state *istate,
 	int stage = ce_stage(ce);
 	const char *name = ce->name;
 	const char *slash = name + ce_namelen(ce);
+	size_t len_eq_last;
+	int cmp_last = 0;
+
+	/*
+	 * We are frequently called during an iteration on a sorted
+	 * list of pathnames and while building a new index.  Therefore,
+	 * there is a high probability that this entry will eventually
+	 * be appended to the index, rather than inserted in the middle.
+	 * If we can confirm that, we can avoid binary searches on the
+	 * components of the pathname.
+	 *
+	 * Compare the entry's full path with the last path in the index.
+	 */
+	if (istate->cache_nr > 0) {
+		cmp_last = strcmp_offset(name,
+			istate->cache[istate->cache_nr - 1]->name,
+			&len_eq_last);
+		if (cmp_last > 0) {
+			if (len_eq_last == 0) {
+				/*
+				 * The entry sorts AFTER the last one in the
+				 * index and their paths have no common prefix,
+				 * so there cannot be a F/D conflict.
+				 */
+				return retval;
+			} else {
+				/*
+				 * The entry sorts AFTER the last one in the
+				 * index, but has a common prefix.  Fall through
+				 * to the loop below to disect the entry's path
+				 * and see where the difference is.
+				 */
+			}
+		} else if (cmp_last == 0) {
+			/*
+			 * The entry exactly matches the last one in the
+			 * index, but because of multiple stage and CE_REMOVE
+			 * items, we fall through and let the regular search
+			 * code handle it.
+			 */
+		}
+	}
 
 	for (;;) {
 		int len;
-- 
2.9.3


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2)
  2017-04-19 17:06 [PATCH v12 0/5] read-cache: speed up add_index_entry git
                   ` (3 preceding siblings ...)
  2017-04-19 17:06 ` [PATCH v12 4/5] read-cache: speed up has_dir_name (part 1) git
@ 2017-04-19 17:06 ` git
  2020-07-04 17:27   ` SZEDER Gábor
  4 siblings, 1 reply; 11+ messages in thread
From: git @ 2017-04-19 17:06 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach has_dir_name() to see if the path of the new item
is greater than the last path in the index array before
attempting to search for it.

has_dir_name() is looking for file/directory collisions
in the index and has to consider each sub-directory
prefix in turn.  This can cause multiple binary searches
for each path.

During operations like checkout, merge_working_tree()
populates the new index in sorted order, so we expect
to be able to append in many cases.

This commit is part 2 of 2.  This commit handles the
additional possible short-cuts as we look at each
sub-directory prefix.

The net-net gains for add_index_entry_with_check() and
both had_dir_name() commits are best seen for very
large repos.

Here are results for an INFLATED version of linux.git
with 1M files.

$ GIT_PERF_REPO=/mnt/test/linux_inflated.git/ ./run upstream/base HEAD ./p0006-read-tree-checkout.sh
Test                                                            upstream/base      HEAD
0006.2: read-tree br_base br_ballast (1043893)                  3.79(3.63+0.15)    2.68(2.52+0.15) -29.3%
0006.3: switch between br_base br_ballast (1043893)             7.55(6.58+0.44)    6.03(4.60+0.43) -20.1%
0006.4: switch between br_ballast br_ballast_plus_1 (1043893)   10.84(9.26+0.59)   8.44(7.06+0.65) -22.1%
0006.5: switch between aliases (1043893)                        10.93(9.39+0.58)   10.24(7.04+0.63) -6.3%

Here are results for a synthetic repo with 4.2M files.

$ GIT_PERF_REPO=~/work/gfw/t/perf/repos/gen-many-files-10.4.3.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
Test                                                            HEAD~3               HEAD
0006.2: read-tree br_base br_ballast (4194305)                  29.96(19.26+10.50)   23.76(13.42+10.12) -20.7%
0006.3: switch between br_base br_ballast (4194305)             56.95(36.08+16.83)   45.54(25.94+15.68) -20.0%
0006.4: switch between br_ballast br_ballast_plus_1 (4194305)   90.94(51.50+31.52)   78.22(39.39+30.70) -14.0%
0006.5: switch between aliases (4194305)                        93.72(51.63+34.09)   77.94(39.00+30.88) -16.8%

Results for medium repos (like linux.git) are mixed and have
more variance (probably do to disk IO unrelated to this test.

$ GIT_PERF_REPO=/mnt/test/linux.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
Test                                                          HEAD~3             HEAD
0006.2: read-tree br_base br_ballast (57994)                  0.25(0.21+0.03)    0.20(0.17+0.02) -20.0%
0006.3: switch between br_base br_ballast (57994)             10.67(6.06+2.92)   10.51(5.94+2.91) -1.5%
0006.4: switch between br_ballast br_ballast_plus_1 (57994)   0.59(0.47+0.16)    0.52(0.40+0.13) -11.9%
0006.5: switch between aliases (57994)                        0.59(0.44+0.17)    0.51(0.38+0.14) -13.6%

$ GIT_PERF_REPO=/mnt/test/linux.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
Test                                                          HEAD~3             HEAD
0006.2: read-tree br_base br_ballast (57994)                  0.24(0.21+0.02)    0.21(0.18+0.02) -12.5%
0006.3: switch between br_base br_ballast (57994)             10.42(5.98+2.91)   10.66(5.86+3.09) +2.3%
0006.4: switch between br_ballast br_ballast_plus_1 (57994)   0.59(0.49+0.13)    0.53(0.37+0.16) -10.2%
0006.5: switch between aliases (57994)                        0.59(0.43+0.17)    0.50(0.37+0.14) -15.3%

Results for smaller repos (like git.git) are not significant.
$ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
Test                                                         HEAD~3            HEAD
0006.2: read-tree br_base br_ballast (3043)                  0.01(0.00+0.00)   0.01(0.00+0.00) +0.0%
0006.3: switch between br_base br_ballast (3043)             0.31(0.17+0.11)   0.29(0.19+0.08) -6.5%
0006.4: switch between br_ballast br_ballast_plus_1 (3043)   0.03(0.02+0.00)   0.03(0.02+0.00) +0.0%
0006.5: switch between aliases (3043)                        0.03(0.02+0.00)   0.03(0.02+0.00) +0.0%

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 read-cache.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/read-cache.c b/read-cache.c
index 9af0bd4..c252b6c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -965,7 +965,7 @@ static int has_dir_name(struct index_state *istate,
 	}
 
 	for (;;) {
-		int len;
+		size_t len;
 
 		for (;;) {
 			if (*--slash == '/')
@@ -975,6 +975,67 @@ static int has_dir_name(struct index_state *istate,
 		}
 		len = slash - name;
 
+		if (cmp_last > 0) {
+			/*
+			 * (len + 1) is a directory boundary (including
+			 * the trailing slash).  And since the loop is
+			 * decrementing "slash", the first iteration is
+			 * the longest directory prefix; subsequent
+			 * iterations consider parent directories.
+			 */
+
+			if (len + 1 <= len_eq_last) {
+				/*
+				 * The directory prefix (including the trailing
+				 * slash) also appears as a prefix in the last
+				 * entry, so the remainder cannot collide (because
+				 * strcmp said the whole path was greater).
+				 *
+				 * EQ: last: xxx/A
+				 *     this: xxx/B
+				 *
+				 * LT: last: xxx/file_A
+				 *     this: xxx/file_B
+				 */
+				return retval;
+			}
+
+			if (len > len_eq_last) {
+				/*
+				 * This part of the directory prefix (excluding
+				 * the trailing slash) is longer than the known
+				 * equal portions, so this sub-directory cannot
+				 * collide with a file.
+				 *
+				 * GT: last: xxxA
+				 *     this: xxxB/file
+				 */
+				return retval;
+			}
+
+			if (istate->cache_nr > 0 &&
+				ce_namelen(istate->cache[istate->cache_nr - 1]) > len) {
+				/*
+				 * The directory prefix lines up with part of
+				 * a longer file or directory name, but sorts
+				 * after it, so this sub-directory cannot
+				 * collide with a file.
+				 *
+				 * last: xxx/yy-file (because '-' sorts before '/')
+				 * this: xxx/yy/abc
+				 */
+				return retval;
+			}
+
+			/*
+			 * This is a possible collision. Fall through and
+			 * let the regular search code handle it.
+			 *
+			 * last: xxx
+			 * this: xxx/file
+			 */
+		}
+
 		pos = index_name_stage_pos(istate, name, len, stage);
 		if (pos >= 0) {
 			/*
-- 
2.9.3


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2)
  2017-04-19 17:06 ` [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2) git
@ 2020-07-04 17:27   ` SZEDER Gábor
  2020-07-05  5:27     ` René Scharfe
  2020-07-06  6:39     ` Junio C Hamano
  0 siblings, 2 replies; 11+ messages in thread
From: SZEDER Gábor @ 2020-07-04 17:27 UTC (permalink / raw)
  To: git
  Cc: git, gitster, peff, Jeff Hostetler, René Scharfe, Brandon Williams

On Wed, Apr 19, 2017 at 05:06:18PM +0000, git@jeffhostetler.com wrote:
> From: Jeff Hostetler <jeffhost@microsoft.com>
> 
> Teach has_dir_name() to see if the path of the new item
> is greater than the last path in the index array before
> attempting to search for it.
> 
> has_dir_name() is looking for file/directory collisions
> in the index and has to consider each sub-directory
> prefix in turn.  This can cause multiple binary searches
> for each path.
> 
> During operations like checkout, merge_working_tree()
> populates the new index in sorted order, so we expect
> to be able to append in many cases.
> 
> This commit is part 2 of 2.  This commit handles the
> additional possible short-cuts as we look at each
> sub-directory prefix.
> 
> The net-net gains for add_index_entry_with_check() and
> both had_dir_name() commits are best seen for very
> large repos.
> 
> Here are results for an INFLATED version of linux.git
> with 1M files.
> 
> $ GIT_PERF_REPO=/mnt/test/linux_inflated.git/ ./run upstream/base HEAD ./p0006-read-tree-checkout.sh
> Test                                                            upstream/base      HEAD
> 0006.2: read-tree br_base br_ballast (1043893)                  3.79(3.63+0.15)    2.68(2.52+0.15) -29.3%
> 0006.3: switch between br_base br_ballast (1043893)             7.55(6.58+0.44)    6.03(4.60+0.43) -20.1%
> 0006.4: switch between br_ballast br_ballast_plus_1 (1043893)   10.84(9.26+0.59)   8.44(7.06+0.65) -22.1%
> 0006.5: switch between aliases (1043893)                        10.93(9.39+0.58)   10.24(7.04+0.63) -6.3%
> 
> Here are results for a synthetic repo with 4.2M files.
> 
> $ GIT_PERF_REPO=~/work/gfw/t/perf/repos/gen-many-files-10.4.3.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
> Test                                                            HEAD~3               HEAD
> 0006.2: read-tree br_base br_ballast (4194305)                  29.96(19.26+10.50)   23.76(13.42+10.12) -20.7%
> 0006.3: switch between br_base br_ballast (4194305)             56.95(36.08+16.83)   45.54(25.94+15.68) -20.0%
> 0006.4: switch between br_ballast br_ballast_plus_1 (4194305)   90.94(51.50+31.52)   78.22(39.39+30.70) -14.0%
> 0006.5: switch between aliases (4194305)                        93.72(51.63+34.09)   77.94(39.00+30.88) -16.8%
> 
> Results for medium repos (like linux.git) are mixed and have
> more variance (probably do to disk IO unrelated to this test.
> 
> $ GIT_PERF_REPO=/mnt/test/linux.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
> Test                                                          HEAD~3             HEAD
> 0006.2: read-tree br_base br_ballast (57994)                  0.25(0.21+0.03)    0.20(0.17+0.02) -20.0%
> 0006.3: switch between br_base br_ballast (57994)             10.67(6.06+2.92)   10.51(5.94+2.91) -1.5%
> 0006.4: switch between br_ballast br_ballast_plus_1 (57994)   0.59(0.47+0.16)    0.52(0.40+0.13) -11.9%
> 0006.5: switch between aliases (57994)                        0.59(0.44+0.17)    0.51(0.38+0.14) -13.6%
> 
> $ GIT_PERF_REPO=/mnt/test/linux.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
> Test                                                          HEAD~3             HEAD
> 0006.2: read-tree br_base br_ballast (57994)                  0.24(0.21+0.02)    0.21(0.18+0.02) -12.5%
> 0006.3: switch between br_base br_ballast (57994)             10.42(5.98+2.91)   10.66(5.86+3.09) +2.3%
> 0006.4: switch between br_ballast br_ballast_plus_1 (57994)   0.59(0.49+0.13)    0.53(0.37+0.16) -10.2%
> 0006.5: switch between aliases (57994)                        0.59(0.43+0.17)    0.50(0.37+0.14) -15.3%
> 
> Results for smaller repos (like git.git) are not significant.
> $ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh
> Test                                                         HEAD~3            HEAD
> 0006.2: read-tree br_base br_ballast (3043)                  0.01(0.00+0.00)   0.01(0.00+0.00) +0.0%
> 0006.3: switch between br_base br_ballast (3043)             0.31(0.17+0.11)   0.29(0.19+0.08) -6.5%
> 0006.4: switch between br_ballast br_ballast_plus_1 (3043)   0.03(0.02+0.00)   0.03(0.02+0.00) +0.0%
> 0006.5: switch between aliases (3043)                        0.03(0.02+0.00)   0.03(0.02+0.00) +0.0%
> 
> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> ---
>  read-cache.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 62 insertions(+), 1 deletion(-)
> 
> diff --git a/read-cache.c b/read-cache.c
> index 9af0bd4..c252b6c 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -965,7 +965,7 @@ static int has_dir_name(struct index_state *istate,
>  	}
>  
>  	for (;;) {
> -		int len;
> +		size_t len;
>  
>  		for (;;) {
>  			if (*--slash == '/')
> @@ -975,6 +975,67 @@ static int has_dir_name(struct index_state *istate,
>  		}
>  		len = slash - name;
>  
> +		if (cmp_last > 0) {
> +			/*
> +			 * (len + 1) is a directory boundary (including
> +			 * the trailing slash).  And since the loop is
> +			 * decrementing "slash", the first iteration is
> +			 * the longest directory prefix; subsequent
> +			 * iterations consider parent directories.
> +			 */
> +
> +			if (len + 1 <= len_eq_last) {
> +				/*
> +				 * The directory prefix (including the trailing
> +				 * slash) also appears as a prefix in the last
> +				 * entry, so the remainder cannot collide (because
> +				 * strcmp said the whole path was greater).
> +				 *
> +				 * EQ: last: xxx/A
> +				 *     this: xxx/B
> +				 *
> +				 * LT: last: xxx/file_A
> +				 *     this: xxx/file_B
> +				 */
> +				return retval;
> +			}
> +
> +			if (len > len_eq_last) {
> +				/*
> +				 * This part of the directory prefix (excluding
> +				 * the trailing slash) is longer than the known
> +				 * equal portions, so this sub-directory cannot
> +				 * collide with a file.
> +				 *
> +				 * GT: last: xxxA
> +				 *     this: xxxB/file
> +				 */
> +				return retval;
> +			}
> +
> +			if (istate->cache_nr > 0 &&
> +				ce_namelen(istate->cache[istate->cache_nr - 1]) > len) {
> +				/*
> +				 * The directory prefix lines up with part of
> +				 * a longer file or directory name, but sorts
> +				 * after it, so this sub-directory cannot
> +				 * collide with a file.
> +				 *
> +				 * last: xxx/yy-file (because '-' sorts before '/')
> +				 * this: xxx/yy/abc

This is problematic, because the index can already contain 'xxx/yy' as
a file, when adding 'xxx/yy/abc', but since 'xxx/yy' as a file sorts
before 'xxx/yy-file', the short-circuiting here doesn't see it and
thus leaves the d-f collision undetected.  Consequently, even Git
porcelain commands can create tree objects with duplicate entries, as
demonstrated in the tests below.

Before this patch the collision is noticed, the colliding file entry
is removed, and these tests succeed.  Removing this condition make
them succeed as well, and FWIW the test suite still passes, but I
haven't thought it through and haven't checked the impact on
performance.

  ---  >8  ---

diff --git a/t/t9999-test.sh b/t/t9999-test.sh
new file mode 100755
index 0000000000..4c802d5940
--- /dev/null
+++ b/t/t9999-test.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+test_description='test'
+
+. ./test-lib.sh
+
+test_expect_success 'file to dir breakage' '
+	>file-to-dir &&
+	# This sorts between "file-to-dir" as a file and "file-to-dir"
+	# as a directory (with the trailing / appended implicitly.
+	>file-to-dir.uh-oh &&
+	git add file-to-dir file-to-dir.uh-oh &&
+	git commit -m "add a file" &&
+
+	# Not "git rm", to preserve "file-to-dir" in the index.
+	rm file-to-dir &&
+	mkdir file-to-dir &&
+	>file-to-dir/file &&
+
+	# It is important to add the file in the directory; adding only
+	# the directory doesnt trigger the bug.
+	git add file-to-dir/file &&
+
+	git diff --cached --no-renames --name-status >actual &&
+
+	cat >expect <<-\EOF &&
+	D	file-to-dir
+	A	file-to-dir/file
+	EOF
+	test_cmp expect actual
+'
+
+test_expect_success 'is it committed as-is?' '
+	git commit -m "replace file with a dir" &&
+	git ls-tree HEAD >actual &&
+
+	# Hardcoded SHA-1 oid :(, because with this bug present
+	# "git rev-parse HEAD:file-to-dir" doesnt show the oid of
+	# the tree, but the oid of the blob that shouldnt be there.
+	cat >expect <<-EOF &&
+	100644 blob $EMPTY_BLOB	file-to-dir.uh-oh
+	040000 tree df2b8fc99e1c1d4dbc0a854d9f72157f1d6ea078	file-to-dir
+	EOF
+	test_cmp expect actual
+'
+
+test_done

  ---  8<  ---

The first test fails with:

  + test_cmp expect actual
  --- expect      2020-07-04 16:52:07.173296314 +0000
  +++ actual      2020-07-04 16:52:07.173296314 +0000
  @@ -1,2 +1 @@
  -D      file-to-dir
   A      file-to-dir/file
  error: last command exited with $?=1
  not ok 1 - file to dir breakage
  
And the second:

  + test_cmp expect actual
  --- expect      2020-07-04 16:52:07.185296659 +0000
  +++ actual      2020-07-04 16:52:07.181296544 +0000
  @@ -1,2 +1,3 @@
  +100644 blob e69de29bb2d1d6434b8b29ae775ad8c2e48c5391   file-to-dir
   100644 blob e69de29bb2d1d6434b8b29ae775ad8c2e48c5391   file-to-dir.uh-oh
   040000 tree df2b8fc99e1c1d4dbc0a854d9f72157f1d6ea078   file-to-dir
  error: last command exited with $?=1
  not ok 2 - is it committed as-is?


> +				 */
> +				return retval;
> +			}
> +
> +			/*
> +			 * This is a possible collision. Fall through and
> +			 * let the regular search code handle it.
> +			 *
> +			 * last: xxx
> +			 * this: xxx/file
> +			 */
> +		}
> +
>  		pos = index_name_stage_pos(istate, name, len, stage);
>  		if (pos >= 0) {
>  			/*
> -- 
> 2.9.3
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2)
  2020-07-04 17:27   ` SZEDER Gábor
@ 2020-07-05  5:27     ` René Scharfe
  2020-07-06  6:39     ` Junio C Hamano
  1 sibling, 0 replies; 11+ messages in thread
From: René Scharfe @ 2020-07-05  5:27 UTC (permalink / raw)
  To: SZEDER Gábor, git
  Cc: git, gitster, peff, Jeff Hostetler, Brandon Williams

Am 04.07.20 um 19:27 schrieb SZEDER Gábor:
> diff --git a/t/t9999-test.sh b/t/t9999-test.sh
> new file mode 100755
> index 0000000000..4c802d5940
> --- /dev/null
> +++ b/t/t9999-test.sh
> @@ -0,0 +1,47 @@
> +#!/bin/sh
> +
> +test_description='test'
> +
> +. ./test-lib.sh
> +
> +test_expect_success 'file to dir breakage' '
> +	>file-to-dir &&
> +	# This sorts between "file-to-dir" as a file and "file-to-dir"
> +	# as a directory (with the trailing / appended implicitly.
> +	>file-to-dir.uh-oh &&
> +	git add file-to-dir file-to-dir.uh-oh &&
> +	git commit -m "add a file" &&
> +
> +	# Not "git rm", to preserve "file-to-dir" in the index.
> +	rm file-to-dir &&
> +	mkdir file-to-dir &&
> +	>file-to-dir/file &&
> +
> +	# It is important to add the file in the directory; adding only
> +	# the directory doesnt trigger the bug.
> +	git add file-to-dir/file &&
> +
> +	git diff --cached --no-renames --name-status >actual &&
> +
> +	cat >expect <<-\EOF &&
> +	D	file-to-dir
> +	A	file-to-dir/file
> +	EOF
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'is it committed as-is?' '
> +	git commit -m "replace file with a dir" &&
> +	git ls-tree HEAD >actual &&
> +
> +	# Hardcoded SHA-1 oid :(, because with this bug present
> +	# "git rev-parse HEAD:file-to-dir" doesnt show the oid of
> +	# the tree, but the oid of the blob that shouldnt be there.
> +	cat >expect <<-EOF &&
> +	100644 blob $EMPTY_BLOB	file-to-dir.uh-oh
> +	040000 tree df2b8fc99e1c1d4dbc0a854d9f72157f1d6ea078	file-to-dir

You can get the hash of the tree with:

  echo "100644 blob $EMPTY_BLOB	file" | git mktree

(TAB before "file", the rest of the whitespace are regular spaces)

> +	EOF
> +	test_cmp expect actual
> +'
> +
> +test_done
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2)
  2020-07-04 17:27   ` SZEDER Gábor
  2020-07-05  5:27     ` René Scharfe
@ 2020-07-06  6:39     ` Junio C Hamano
  2020-07-08 13:49       ` Jeff Hostetler
  1 sibling, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2020-07-06  6:39 UTC (permalink / raw)
  To: SZEDER Gábor
  Cc: git, git, peff, Jeff Hostetler, René Scharfe, Brandon Williams

SZEDER Gábor <szeder.dev@gmail.com> writes:

>> +				 * last: xxx/yy-file (because '-' sorts before '/')
>> +				 * this: xxx/yy/abc
>
> This is problematic, because the index can already contain 'xxx/yy' as
> a file, when adding 'xxx/yy/abc', but since 'xxx/yy' as a file sorts
> before 'xxx/yy-file', the short-circuiting here doesn't see it and
> thus leaves the d-f collision undetected.  Consequently, even Git
> porcelain commands can create tree objects with duplicate entries, as
> demonstrated in the tests below.

Yeah, the "optimization" is quite bogus.  Thanks for catching it.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2)
  2020-07-06  6:39     ` Junio C Hamano
@ 2020-07-08 13:49       ` Jeff Hostetler
  2020-07-16 17:11         ` René Scharfe
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Hostetler @ 2020-07-08 13:49 UTC (permalink / raw)
  To: Junio C Hamano, SZEDER Gábor
  Cc: git, peff, Jeff Hostetler, René Scharfe, Brandon Williams



On 7/6/20 2:39 AM, Junio C Hamano wrote:
> SZEDER Gábor <szeder.dev@gmail.com> writes:
> 
>>> +				 * last: xxx/yy-file (because '-' sorts before '/')
>>> +				 * this: xxx/yy/abc
>>
>> This is problematic, because the index can already contain 'xxx/yy' as
>> a file, when adding 'xxx/yy/abc', but since 'xxx/yy' as a file sorts
>> before 'xxx/yy-file', the short-circuiting here doesn't see it and
>> thus leaves the d-f collision undetected.  Consequently, even Git
>> porcelain commands can create tree objects with duplicate entries, as
>> demonstrated in the tests below.
> 
> Yeah, the "optimization" is quite bogus.  Thanks for catching it.
> 

yes, thanks!
Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2)
  2020-07-08 13:49       ` Jeff Hostetler
@ 2020-07-16 17:11         ` René Scharfe
  0 siblings, 0 replies; 11+ messages in thread
From: René Scharfe @ 2020-07-16 17:11 UTC (permalink / raw)
  To: Jeff Hostetler, Junio C Hamano, SZEDER Gábor
  Cc: git, peff, Jeff Hostetler, Brandon Williams

Am 08.07.20 um 15:49 schrieb Jeff Hostetler:
>
>
> On 7/6/20 2:39 AM, Junio C Hamano wrote:
>> SZEDER Gábor <szeder.dev@gmail.com> writes:
>>
>>>> +                 * last: xxx/yy-file (because '-' sorts before '/')
>>>> +                 * this: xxx/yy/abc
>>>
>>> This is problematic, because the index can already contain 'xxx/yy' as
>>> a file, when adding 'xxx/yy/abc', but since 'xxx/yy' as a file sorts
>>> before 'xxx/yy-file', the short-circuiting here doesn't see it and
>>> thus leaves the d-f collision undetected.  Consequently, even Git
>>> porcelain commands can create tree objects with duplicate entries, as
>>> demonstrated in the tests below.
>>
>> Yeah, the "optimization" is quite bogus.  Thanks for catching it.
>>
>
> yes, thanks!

OK, so now we just need to fix it.  Like this perhaps?

-- >8 --
From f4b2c70a34bb612e2ccc23a31e2ba8e01dedc373 Mon Sep 17 00:00:00 2001
Subject: [PATCH] read-cache: remove bogus shortcut

has_dir_name() has some optimizations for the case where entries are
added to an index in the correct order.  They kick in if the new entry
sorts after the last one.  One of them exits early if the last entry has
a longer name than the directory of the new entry.  Here's its comment:

/*
 * The directory prefix lines up with part of
 * a longer file or directory name, but sorts
 * after it, so this sub-directory cannot
 * collide with a file.
 *
 * last: xxx/yy-file (because '-' sorts before '/')
 * this: xxx/yy/abc
 */

However, a file named xxx/yy would be sorted before xxx/yy-file because
'-' sorts after NUL, so the length check against the last entry is not
sufficient to rule out a collision.  Remove it.

Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Suggested-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: René Scharfe <l.s.r@web.de>
---
 read-cache.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index aa427c5c17..8ed1c29b54 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1171,20 +1171,6 @@ static int has_dir_name(struct index_state *istate,
 				return retval;
 			}

-			if (istate->cache_nr > 0 &&
-				ce_namelen(istate->cache[istate->cache_nr - 1]) > len) {
-				/*
-				 * The directory prefix lines up with part of
-				 * a longer file or directory name, but sorts
-				 * after it, so this sub-directory cannot
-				 * collide with a file.
-				 *
-				 * last: xxx/yy-file (because '-' sorts before '/')
-				 * this: xxx/yy/abc
-				 */
-				return retval;
-			}
-
 			/*
 			 * This is a possible collision. Fall through and
 			 * let the regular search code handle it.
--
2.27.0

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-07-16 17:12 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-19 17:06 [PATCH v12 0/5] read-cache: speed up add_index_entry git
2017-04-19 17:06 ` [PATCH v12 1/5] read-cache: add strcmp_offset function git
2017-04-19 17:06 ` [PATCH v12 2/5] p0006-read-tree-checkout: perf test to time read-tree git
2017-04-19 17:06 ` [PATCH v12 3/5] read-cache: speed up add_index_entry during checkout git
2017-04-19 17:06 ` [PATCH v12 4/5] read-cache: speed up has_dir_name (part 1) git
2017-04-19 17:06 ` [PATCH v12 5/5] read-cache: speed up has_dir_name (part 2) git
2020-07-04 17:27   ` SZEDER Gábor
2020-07-05  5:27     ` René Scharfe
2020-07-06  6:39     ` Junio C Hamano
2020-07-08 13:49       ` Jeff Hostetler
2020-07-16 17:11         ` René Scharfe

Code repositories for project(s) associated with this inbox:

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).