From: Ben Peart <Ben.Peart@microsoft.com>
To: "git@vger.kernel.org" <git@vger.kernel.org>,
"gitster@pobox.com" <gitster@pobox.com>
Cc: "pclouds@gmail.com" <pclouds@gmail.com>,
"alexmv@dropbox.com" <alexmv@dropbox.com>,
"blees@dcon.de" <blees@dcon.de>,
"bmwill@google.com" <bmwill@google.com>,
"avarab@gmail.com" <avarab@gmail.com>,
"johannes.schindelin@gmx.de" <johannes.schindelin@gmx.de>,
"martin.agren@gmail.com" <martin.agren@gmail.com>,
Ben Peart <Ben.Peart@microsoft.com>
Subject: [PATCH v3 1/2] fsexcludes: add a programmatic way to exclude files from git's working directory traversal logic
Date: Fri, 13 Apr 2018 12:22:52 +0000 [thread overview]
Message-ID: <20180413122218.1756-2-benpeart@microsoft.com> (raw)
In-Reply-To: <20180413122218.1756-1-benpeart@microsoft.com>
The File System Excludes module is a new programmatic way to exclude files and
folders from git's traversal of the working directory. fsexcludes_init() should
be called with a string buffer that contains a NUL separated list of path names
of the files and/or directories that should be included. Any path not listed
will be excluded. The paths should be relative to the root of the working
directory and be separated by a single NUL.
The excludes logic in dir.c has been updated to honor the results of
fsexcludes_is_excluded_from(). If fsexcludes does not exclude the file, the
normal excludes logic is also checked as it could further reduce the set of
files that should be included.
Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
Makefile | 1 +
dir.c | 24 +++++-
fsexcludes.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++
fsexcludes.h | 29 +++++++
4 files changed, 263 insertions(+), 2 deletions(-)
create mode 100644 fsexcludes.c
create mode 100644 fsexcludes.h
diff --git a/Makefile b/Makefile
index f181687250..a4f1471272 100644
--- a/Makefile
+++ b/Makefile
@@ -822,6 +822,7 @@ LIB_OBJS += exec_cmd.o
LIB_OBJS += fetch-object.o
LIB_OBJS += fetch-pack.o
LIB_OBJS += fsck.o
+LIB_OBJS += fsexcludes.o
LIB_OBJS += fsmonitor.o
LIB_OBJS += gettext.o
LIB_OBJS += gpg-interface.o
diff --git a/dir.c b/dir.c
index 63a917be45..47a073efe1 100644
--- a/dir.c
+++ b/dir.c
@@ -18,6 +18,7 @@
#include "utf8.h"
#include "varint.h"
#include "ewah/ewok.h"
+#include "fsexcludes.h"
#include "fsmonitor.h"
/*
@@ -1102,6 +1103,12 @@ int is_excluded_from_list(const char *pathname,
struct exclude_list *el, struct index_state *istate)
{
struct exclude *exclude;
+
+ if (*dtype == DT_UNKNOWN)
+ *dtype = get_dtype(NULL, istate, pathname, pathlen);
+ if (fsexcludes_is_excluded_from(istate, pathname, pathlen, *dtype) > 0)
+ return 1;
+
exclude = last_exclude_matching_from_list(pathname, pathlen, basename,
dtype, el, istate);
if (exclude)
@@ -1317,8 +1324,15 @@ struct exclude *last_exclude_matching(struct dir_struct *dir,
int is_excluded(struct dir_struct *dir, struct index_state *istate,
const char *pathname, int *dtype_p)
{
- struct exclude *exclude =
- last_exclude_matching(dir, istate, pathname, dtype_p);
+ struct exclude *exclude;
+ int pathlen = strlen(pathname);
+
+ if (*dtype_p == DT_UNKNOWN)
+ *dtype_p = get_dtype(NULL, istate, pathname, pathlen);
+ if (fsexcludes_is_excluded_from(istate, pathname, pathlen, *dtype_p) > 0)
+ return 1;
+
+ exclude = last_exclude_matching(dir, istate, pathname, dtype_p);
if (exclude)
return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1;
return 0;
@@ -1671,6 +1685,9 @@ static enum path_treatment treat_one_path(struct dir_struct *dir,
if (dtype != DT_DIR && has_path_in_index)
return path_none;
+ if (fsexcludes_is_excluded_from(istate, path->buf, path->len, dtype) > 0)
+ return path_excluded;
+
/*
* When we are looking at a directory P in the working tree,
* there are three cases:
@@ -2011,6 +2028,9 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir,
/* add the path to the appropriate result list */
switch (state) {
case path_excluded:
+ if (fsexcludes_is_excluded_from(istate, path.buf, path.len,
+ get_dtype(cdir.de, istate, path.buf, path.len)) > 0)
+ break;
if (dir->flags & DIR_SHOW_IGNORED)
dir_add_name(dir, istate, path.buf, path.len);
else if ((dir->flags & DIR_SHOW_IGNORED_TOO) ||
diff --git a/fsexcludes.c b/fsexcludes.c
new file mode 100644
index 0000000000..0ef57f107b
--- /dev/null
+++ b/fsexcludes.c
@@ -0,0 +1,211 @@
+#include "cache.h"
+#include "fsexcludes.h"
+#include "hashmap.h"
+#include "strbuf.h"
+
+/* Set to 1 by fsexcludes_init() and back to 0 by fsexcludes_free(). */
+static int fsexcludes_initialized = 0;
+/* The NUL separated path list handed over to fsexcludes_init(). */
+static struct strbuf fsexcludes_data = STRBUF_INIT;
+/* Built on first use from fsexcludes_data: one entry per listed path. */
+static struct hashmap fsexcludes_hashmap;
+/* Built on first use from fsexcludes_data: one entry per leading directory. */
+static struct hashmap parent_directory_hashmap;
+
+/*
+ * Hashmap entry naming one path or path prefix. The entry does not own
+ * `pattern`: entries stored in the maps point into the path list buffer,
+ * and lookup keys point at a temporary buffer. Only the first
+ * `patternlen` bytes of `pattern` belong to the entry.
+ */
+struct fsexcludes {
+ struct hashmap_entry ent; /* must be the first member! */
+ const char *pattern;
+ int patternlen;
+};
+
+/*
+ * Hash and compare routines used for both hashmaps. They are assigned
+ * each time a hashmap is built: the case-insensitive variants when
+ * ignore_case is set, the case-sensitive ones otherwise.
+ */
+static unsigned int(*fsexcludeshash)(const void *buf, size_t len);
+static int(*fsexcludescmp)(const char *a, const char *b, size_t len);
+
+/*
+ * hashmap comparison callback: return 0 when the two entries name the
+ * same path and non-zero otherwise.
+ *
+ * The lengths must match as well as the bytes. Entries in the parent
+ * directory map point into the middle of a longer string, so comparing
+ * only the first `patternlen` bytes of one side would let the prefix
+ * "a/" compare equal to "a/b/" whenever their hashes happen to collide.
+ */
+static int fsexcludes_hashmap_cmp(const void *unused_cmp_data,
+	const void *a, const void *b, const void *key)
+{
+	const struct fsexcludes *fse1 = a;
+	const struct fsexcludes *fse2 = b;
+
+	if (fse1->patternlen != fse2->patternlen)
+		return 1;
+
+	return fsexcludescmp(fse1->pattern, fse2->pattern, fse1->patternlen);
+}
+
+/*
+ * Probe `map` for the path `pattern` (`patternlen` bytes long).
+ *
+ * The whole path is looked up first, then every prefix of it that ends
+ * in a '/', shortest first: for 'a/b/foo.txt' the candidates are
+ * 'a/b/foo.txt', 'a/' and 'a/b/'.
+ *
+ * Returns 0 as soon as one candidate is found in the map, 1 if none is.
+ */
+static int check_fsexcludes_hashmap(struct hashmap *map, const char *pattern, int patternlen)
+{
+	struct strbuf path = STRBUF_INIT;
+	struct fsexcludes key;
+	const char *sep;
+	int missing = 1;
+
+	/* Work on a NUL terminated copy so strchr() stops at patternlen. */
+	strbuf_add(&path, pattern, patternlen);
+	key.pattern = path.buf;
+
+	/* Exact match on the full path. */
+	key.patternlen = path.len;
+	hashmap_entry_init(&key, fsexcludeshash(key.pattern, key.patternlen));
+	if (hashmap_get(map, &key, NULL)) {
+		missing = 0;
+		goto done;
+	}
+
+	/* Each leading directory, including its trailing slash. */
+	for (sep = strchr(path.buf, '/'); sep; sep = strchr(sep + 1, '/')) {
+		key.patternlen = sep - path.buf + 1;
+		hashmap_entry_init(&key, fsexcludeshash(key.pattern, key.patternlen));
+		if (hashmap_get(map, &key, NULL)) {
+			missing = 0;
+			goto done;
+		}
+	}
+
+done:
+	strbuf_release(&path);
+	return missing;
+}
+
+/*
+ * Insert a new entry for the first `patternlen` bytes of `pattern` into
+ * `map`. The entry stores the pointer it is given, not a copy, so
+ * `pattern` has to remain valid for as long as the entry is in the map.
+ */
+static void fsexcludes_hashmap_add(struct hashmap *map, const char *pattern, const int patternlen)
+{
+	struct fsexcludes *entry = xmalloc(sizeof(*entry));
+
+	entry->pattern = pattern;
+	entry->patternlen = patternlen;
+	hashmap_entry_init(entry, fsexcludeshash(pattern, patternlen));
+	hashmap_add(map, entry);
+}
+
+/*
+ * Build `map` from the NUL separated path list in `fsexcludes_data`,
+ * adding one entry per path so that paths can be looked up quickly.
+ *
+ * Also selects the hash and compare routines according to ignore_case.
+ * The entries point into `fsexcludes_data`, which must therefore outlive
+ * the map.
+ */
+static void initialize_fsexcludes_hashmap(struct hashmap *map, struct strbuf *fsexcludes_data)
+{
+	const char *buf = fsexcludes_data->buf;
+	const char *end = buf + fsexcludes_data->len;
+	const char *entry = buf;
+	const char *p;
+
+	fsexcludeshash = ignore_case ? memihash : memhash;
+	fsexcludescmp = ignore_case ? strncasecmp : strncmp;
+	hashmap_init(map, fsexcludes_hashmap_cmp, NULL, 0);
+
+	/*
+	 * Walk the buffer with pointers rather than an int index, so the
+	 * scan cannot overflow or mix signed and unsigned operands.
+	 */
+	for (p = buf; p < end; p++) {
+		if (*p == '\0') {
+			fsexcludes_hashmap_add(map, entry, p - entry);
+			entry = p + 1;
+		}
+	}
+
+	/*
+	 * The list is NUL separated, so the last path need not be followed
+	 * by a NUL inside the buffer; pick it up as well. A list that does
+	 * end in NUL leaves nothing behind and adds no extra entry.
+	 */
+	if (entry < end)
+		fsexcludes_hashmap_add(map, entry, end - entry);
+}
+
+/*
+ * Add every directory leading up to `pattern` to `map`, each one as the
+ * prefix of `pattern` up to and including a '/'. For 'a/b/foo.txt' that
+ * is 'a/' and 'a/b/'. The search for slashes starts at the second byte,
+ * so a '/' in the very first position does not produce an entry of its
+ * own. Patterns of one byte or less add nothing.
+ *
+ * `pattern` must be NUL terminated; the entries point into it.
+ */
+static void parent_directory_hashmap_add(struct hashmap *map, const char *pattern, const int patternlen)
+{
+	const char *slash;
+	struct fsexcludes key;
+
+	if (patternlen <= 1)
+		return;
+
+	for (slash = strchr(pattern + 1, '/'); slash; slash = strchr(slash + 1, '/')) {
+		/*
+		 * Many paths share the same leading directories, so
+		 * duplicates are the common case. Probe with a key on the
+		 * stack and only allocate when the directory is new.
+		 */
+		key.pattern = pattern;
+		key.patternlen = slash - pattern + 1;
+		hashmap_entry_init(&key, fsexcludeshash(key.pattern, key.patternlen));
+		if (hashmap_get(map, &key, NULL))
+			continue;
+
+		fsexcludes_hashmap_add(map, key.pattern, key.patternlen);
+	}
+}
+
+/*
+ * Build `map` from the NUL separated path list in `vfs_data`, adding the
+ * leading directories of every listed path so that directories can be
+ * looked up quickly.
+ *
+ * Also selects the hash and compare routines according to ignore_case.
+ * The entries point into `vfs_data`, which must therefore outlive the
+ * map.
+ */
+static void initialize_parent_directory_hashmap(struct hashmap *map, struct strbuf *vfs_data)
+{
+	const char *buf = vfs_data->buf;
+	const char *end = buf + vfs_data->len;
+	const char *entry = buf;
+	const char *p;
+
+	fsexcludeshash = ignore_case ? memihash : memhash;
+	fsexcludescmp = ignore_case ? strncasecmp : strncmp;
+	hashmap_init(map, fsexcludes_hashmap_cmp, NULL, 0);
+
+	/*
+	 * Walk the buffer with pointers rather than an int index, so the
+	 * scan cannot overflow or mix signed and unsigned operands.
+	 */
+	for (p = buf; p < end; p++) {
+		if (*p == '\0') {
+			parent_directory_hashmap_add(map, entry, p - entry);
+			entry = p + 1;
+		}
+	}
+
+	/*
+	 * The list is NUL separated, so the last path need not be followed
+	 * by a NUL inside the buffer; pick it up as well. It is still NUL
+	 * terminated because a strbuf always keeps a NUL after its data.
+	 */
+	if (entry < end)
+		parent_directory_hashmap_add(map, entry, end - entry);
+}
+
+/*
+ * Look up `pathname` (`pathlen` bytes long) as a directory: a '/' is
+ * appended and the result is probed in `map`.
+ *
+ * Returns 0 when the directory is found in the map, 1 when it is not.
+ */
+static int check_directory_hashmap(struct hashmap *map, const char *pathname, int pathlen)
+{
+	struct strbuf dir = STRBUF_INIT;
+	struct fsexcludes key;
+	int missing;
+
+	/* Directory entries are stored with their trailing slash. */
+	strbuf_add(&dir, pathname, pathlen);
+	strbuf_addch(&dir, '/');
+
+	key.pattern = dir.buf;
+	key.patternlen = dir.len;
+	hashmap_entry_init(&key, fsexcludeshash(key.pattern, key.patternlen));
+	missing = !hashmap_get(map, &key, NULL);
+
+	strbuf_release(&dir);
+	return missing;
+}
+
+/*
+ * Return 1 for exclude, 0 for include and -1 for undecided.
+ *
+ * Undecided is returned when fsexcludes_init() has not been called and
+ * for any dtype other than DT_REG, DT_DIR and DT_LNK. DT_REG paths are
+ * looked up in the map of listed paths; DT_DIR and DT_LNK paths are
+ * looked up in the map of leading directories. `istate` is not used.
+ */
+int fsexcludes_is_excluded_from(struct index_state *istate,
+	const char *pathname, int pathlen, int dtype)
+{
+	if (!fsexcludes_initialized)
+		return -1;
+
+	if (dtype == DT_REG) {
+		/*
+		 * Lazily init the hashmap. The maps are initialized with
+		 * NULL compare data, so cmpfn_data stays NULL and cannot
+		 * tell a built map from an unbuilt one; testing it would
+		 * rebuild (and leak) the map on every call. The table
+		 * size is zero until hashmap_init() has run.
+		 */
+		if (!fsexcludes_hashmap.tablesize)
+			initialize_fsexcludes_hashmap(&fsexcludes_hashmap, &fsexcludes_data);
+
+		return check_fsexcludes_hashmap(&fsexcludes_hashmap, pathname, pathlen);
+	}
+
+	if (dtype == DT_DIR || dtype == DT_LNK) {
+		/* Lazily init the hashmap; see above for the guard. */
+		if (!parent_directory_hashmap.tablesize)
+			initialize_parent_directory_hashmap(&parent_directory_hashmap, &fsexcludes_data);
+
+		return check_directory_hashmap(&parent_directory_hashmap, pathname, pathlen);
+	}
+
+	return -1;
+}
+
+/*
+ * Take over the NUL separated path list in `sb` and switch the module
+ * on. Ownership of the storage moves to this module and `sb` is left
+ * empty, as if freshly set to STRBUF_INIT.
+ */
+void fsexcludes_init(struct strbuf *sb)
+{
+	/*
+	 * Drop anything a previous call left behind first: the old buffer
+	 * would otherwise leak, and hashmaps built from it would keep
+	 * pointing into it instead of being rebuilt from the new list.
+	 */
+	fsexcludes_free();
+
+	/* fsexcludes_data is empty now, so swapping hands `sb` an empty buffer. */
+	strbuf_swap(&fsexcludes_data, sb);
+	fsexcludes_initialized = 1;
+}
+
+/*
+ * Switch the module off and release everything it holds: both hashmaps
+ * together with their entries, and the path list buffer.
+ */
+void fsexcludes_free(void)
+{
+	fsexcludes_initialized = 0;
+	hashmap_free(&parent_directory_hashmap, 1);
+	hashmap_free(&fsexcludes_hashmap, 1);
+	strbuf_release(&fsexcludes_data);
+}
diff --git a/fsexcludes.h b/fsexcludes.h
new file mode 100644
index 0000000000..10246daa02
--- /dev/null
+++ b/fsexcludes.h
@@ -0,0 +1,29 @@
+#ifndef FSEXCLUDES_H
+#define FSEXCLUDES_H
+
+/* Only pointers to these are used below, so forward declarations suffice. */
+struct index_state;
+struct strbuf;
+
+/*
+ * The file system excludes functions provide a way to programmatically
+ * limit where git will scan for untracked files. This is used to speed
+ * up the scan by avoiding scanning parts of the work directory that do
+ * not have any new files.
+ */
+
+/*
+ * sb should contain a NUL separated list of path names of the files
+ * and/or directories that should be checked. Any path not listed will
+ * be excluded from the scan.
+ *
+ * NOTE: fsexcludes_init() will take ownership of the storage passed in
+ * sb and will reset sb to `STRBUF_INIT`
+ */
+void fsexcludes_init(struct strbuf *sb);
+
+/*
+ * Release the path list and the lookup tables built from it. After this
+ * call fsexcludes_is_excluded_from() returns -1 until fsexcludes_init()
+ * is called again.
+ */
+void fsexcludes_free(void);
+
+/*
+ * Return 1 for exclude, 0 for include and -1 for undecided.
+ *
+ * dtype is passed by value (one of the DT_* constants); it is never
+ * updated by this function.
+ */
+int fsexcludes_is_excluded_from(struct index_state *istate,
+	const char *pathname, int pathlen, int dtype);
+
+#endif
--
2.17.0.windows.1
next prev parent reply other threads:[~2018-04-13 12:22 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-04-10 21:04 [PATCH v1 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-10 21:04 ` [PATCH v1 1/2] fsexcludes: add a programmatic way to exclude files from git's working directory traversal logic Ben Peart
2018-04-10 22:09 ` Martin Ågren
2018-04-11 19:56 ` Ben Peart
2018-04-11 6:58 ` Junio C Hamano
2018-04-10 21:04 ` [PATCH v1 2/2] fsmonitor: switch to use new fsexcludes logic and remove unused untracked cache based logic Ben Peart
2018-04-11 20:01 ` [PATCH v2 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-11 20:01 ` [PATCH v2 1/2] fsexcludes: add a programmatic way to exclude files from git's working directory traversal logic Ben Peart
2018-04-11 23:52 ` Junio C Hamano
2018-04-13 11:53 ` Ben Peart
2018-04-11 20:01 ` [PATCH v2 2/2] fsmonitor: switch to use new fsexcludes logic and remove unused untracked cache based logic Ben Peart
2018-04-13 12:22 ` [PATCH v3 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-13 12:22 ` Ben Peart [this message]
2018-04-13 12:22 ` [PATCH v3 2/2] fsmonitor: switch to use new fsexcludes logic and remove unused untracked cache based logic Ben Peart
2018-04-18 15:31 ` [PATCH v3 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-18 21:25 ` Junio C Hamano
2018-04-14 15:59 ` [PATCH v1 " Duy Nguyen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180413122218.1756-2-benpeart@microsoft.com \
--to=ben.peart@microsoft.com \
--cc=alexmv@dropbox.com \
--cc=avarab@gmail.com \
--cc=blees@dcon.de \
--cc=bmwill@google.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=johannes.schindelin@gmx.de \
--cc=martin.agren@gmail.com \
--cc=pclouds@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).