[PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json

git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed

* [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
@ 2019-06-24 13:02 Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
                   ` (10 more replies)
  0 siblings, 11 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

v2 main changes:

- --json is renamed --debug-json
- JSON field names now use '_' instead of '.' or '-' to be friendlier
  to some languages. I stick to underscore_name instead of camelCase
  because the former is closer to what we use
- extension location is printed, in case you need to decode the
  extension by yourself (previously only the size was printed)
- all extensions are printed in the same order they appear in the file
  (previously EOIE and IEOT were printed first because that is the
  order in which we parse them)
- resolve undo extension is reorganized a bit to be easier to read
- tests added. Example json files are in t/t3011

Interdiff

diff --git a/Documentation/git-ls-files.txt b/Documentation/git-ls-files.txt
index 54011c8f65..fec5cb7170 100644
--- a/Documentation/git-ls-files.txt
+++ b/Documentation/git-ls-files.txt
@@ -60,11 +60,6 @@ OPTIONS
 --stage::
 	Show staged contents' mode bits, object name and stage number in the output.
 
---json::
-	Dump the entire index content in JSON format. This is for
-	debugging purposes and the JSON structure may change from time
-	to time.
-
 --directory::
 	If a whole directory is classified as "other", show just its
 	name (with a trailing slash) and not its whole contents.
@@ -167,6 +162,11 @@ a space) at the start of each line:
 	possible for manual inspection; the exact format may change at
 	any time.
 
+--debug-json::
+	Dump the entire index content in JSON format. This is for
+	debugging purposes. The JSON structure is subject to change.
+	Note that the strings are not always valid UTF-8.
+
 --eol::
 	Show <eolinfo> and <eolattr> of files.
 	<eolinfo> is the file content identification used by Git when
diff --git a/builtin/ls-files.c b/builtin/ls-files.c
index d00f6d3074..b60cd9ab28 100644
--- a/builtin/ls-files.c
+++ b/builtin/ls-files.c
@@ -545,8 +545,6 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 			N_("show staged contents' object name in the output")),
 		OPT_BOOL('k', "killed", &show_killed,
 			N_("show files on the filesystem that need to be removed")),
-		OPT_BOOL(0, "json", &show_json,
-			N_("dump index content in json format")),
 		OPT_BIT(0, "directory", &dir.flags,
 			N_("show 'other' directories' names only"),
 			DIR_SHOW_OTHER_DIRECTORIES),
@@ -581,6 +579,8 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 			N_("pretend that paths removed since <tree-ish> are still present")),
 		OPT__ABBREV(&abbrev),
 		OPT_BOOL(0, "debug", &debug_mode, N_("show debugging data")),
+		OPT_BOOL(0, "debug-json", &show_json,
+			N_("dump index content in JSON format")),
 		OPT_END()
 	};
 
@@ -636,7 +636,7 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 		    "--error-unmatch");
 
 	parse_pathspec(&pathspec, 0,
-		       PATHSPEC_PREFER_CWD,
+		       show_json ? PATHSPEC_PREFER_FULL : PATHSPEC_PREFER_CWD,
 		       prefix, argv);
 
 	/*
@@ -668,8 +668,14 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 		show_cached = 1;
 	if (show_json && (show_stage || show_deleted || show_others ||
 			  show_unmerged || show_killed || show_modified ||
-			  show_cached || show_resolve_undo || with_tree))
-		die(_("--show-json cannot be used with other --show- options, or --with-tree"));
+			  show_cached || pathspec.nr))
+		die(_("--debug-json cannot be used with other file selection options"));
+	if (show_json && show_resolve_undo)
+		die(_("--debug-json cannot be used with %s"), "--resolve-undo");
+	if (show_json && with_tree)
+		die(_("--debug-json cannot be used with %s"), "--with-tree");
+	if (show_json && debug_mode)
+		die(_("--debug-json cannot be used with %s"), "--debug");
 
 	if (with_tree) {
 		/*
diff --git a/cache-tree.c b/cache-tree.c
index fc44016fe8..b6a233307e 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -599,18 +599,13 @@ struct cache_tree *cache_tree_read(const char *buffer, unsigned long size,
 	struct cache_tree *ret;
 
 	if (jw) {
-		jw_object_inline_begin_object(jw, "cache-tree");
-		jw_object_intmax(jw, "ext-size", size);
 		jw_object_inline_begin_object(jw, "root");
 	}
 	if (buffer[0])
 		ret = NULL; /* not the whole tree */
 	else
 		ret = read_one(&buffer, &size, jw);
-	if (jw) {
-		jw_end(jw);	/* root */
-		jw_end(jw);	/* cache-tree */
-	}
+	jw_end_gently(jw);
 	return ret;
 }
 
diff --git a/dir.c b/dir.c
index f389eee24a..8808577ea3 100644
--- a/dir.c
+++ b/dir.c
@@ -2902,13 +2902,15 @@ struct untracked_cache *read_untracked_extension(const void *data,
 	uc->exclude_per_dir = xstrdup(exclude_per_dir);
 
 	if (jw) {
-		jw_object_inline_begin_object(jw, "untracked-cache");
-		jw_object_intmax(jw, "ext-size", sz);
 		jw_object_string(jw, "ident", ident);
-		jw_object_oid_stat(jw, "info/exclude", &uc->ss_info_exclude);
-		jw_object_oid_stat(jw, "excludes-file", &uc->ss_excludes_file);
+		jw_object_oid_stat(jw, "info_exclude", &uc->ss_info_exclude);
+		jw_object_oid_stat(jw, "excludes_file", &uc->ss_excludes_file);
 		jw_object_intmax(jw, "flags", uc->dir_flags);
-		jw_object_string(jw, "excludes-per-dir", uc->exclude_per_dir);
+		if (uc->dir_flags & DIR_SHOW_OTHER_DIRECTORIES)
+			jw_object_bool(jw, "show_other_directories", 1);
+		if (uc->dir_flags & DIR_HIDE_EMPTY_DIRECTORIES)
+			jw_object_bool(jw, "hide_empty_directories", 1);
+		jw_object_string(jw, "excludes_per_dir", uc->exclude_per_dir);
 	}
 
 	/* NUL after exclude_per_dir is covered by sizeof(*ouc) */
@@ -2968,7 +2970,6 @@ struct untracked_cache *read_untracked_extension(const void *data,
 		free_untracked_cache(uc);
 		uc = NULL;
 	}
-	jw_end_gently(jw);
 	return uc;
 }
 
diff --git a/fsmonitor.c b/fsmonitor.c
index f6ba437255..5ed55ad176 100644
--- a/fsmonitor.c
+++ b/fsmonitor.c
@@ -52,12 +52,9 @@ int read_fsmonitor_extension(struct index_state *istate, const void *data,
 	istate->fsmonitor_dirty = fsmonitor_dirty;
 
 	if (istate->jw) {
-		jw_object_inline_begin_object(istate->jw, "fsmonitor");
 		jw_object_intmax(istate->jw, "version", hdr_version);
-		jw_object_intmax(istate->jw, "last-update", istate->fsmonitor_last_update);
+		jw_object_intmax(istate->jw, "last_update", istate->fsmonitor_last_update);
 		jw_object_ewah(istate->jw, "dirty", fsmonitor_dirty);
-		jw_object_intmax(istate->jw, "ext-size", sz);
-		jw_end(istate->jw);
 	}
 	trace_printf_key(&trace_fsmonitor, "read fsmonitor extension successful");
 	return 0;
diff --git a/json-writer.c b/json-writer.c
index 70403580ca..c0bd302e4e 100644
--- a/json-writer.c
+++ b/json-writer.c
@@ -203,19 +203,25 @@ void jw_object_null(struct json_writer *jw, const char *key)
 	strbuf_addstr(&jw->json, "null");
 }
 
+void jw_object_filemode(struct json_writer *jw, const char *key, mode_t mode)
+{
+	object_common(jw, key);
+	strbuf_addf(&jw->json, "\"%06o\"", mode);
+}
+
 void jw_object_stat_data(struct json_writer *jw, const char *name,
 			 const struct stat_data *sd)
 {
 	jw_object_inline_begin_object(jw, name);
-	jw_object_intmax(jw, "st_ctime.sec", sd->sd_ctime.sec);
-	jw_object_intmax(jw, "st_ctime.nsec", sd->sd_ctime.nsec);
-	jw_object_intmax(jw, "st_mtime.sec", sd->sd_mtime.sec);
-	jw_object_intmax(jw, "st_mtime.nsec", sd->sd_mtime.nsec);
-	jw_object_intmax(jw, "st_dev", sd->sd_dev);
-	jw_object_intmax(jw, "st_ino", sd->sd_ino);
-	jw_object_intmax(jw, "st_uid", sd->sd_uid);
-	jw_object_intmax(jw, "st_gid", sd->sd_gid);
-	jw_object_intmax(jw, "st_size", sd->sd_size);
+	jw_object_intmax(jw, "ctime_sec", sd->sd_ctime.sec);
+	jw_object_intmax(jw, "ctime_nsec", sd->sd_ctime.nsec);
+	jw_object_intmax(jw, "mtime_sec", sd->sd_mtime.sec);
+	jw_object_intmax(jw, "mtime_nsec", sd->sd_mtime.nsec);
+	jw_object_intmax(jw, "device", sd->sd_dev);
+	jw_object_intmax(jw, "inode", sd->sd_ino);
+	jw_object_intmax(jw, "uid", sd->sd_uid);
+	jw_object_intmax(jw, "gid", sd->sd_gid);
+	jw_object_intmax(jw, "size", sd->sd_size);
 	jw_end(jw);
 }
 
diff --git a/json-writer.h b/json-writer.h
index f778e019a2..07d841d52a 100644
--- a/json-writer.h
+++ b/json-writer.h
@@ -42,8 +42,10 @@
  * of the given strings.
  */
 
+#include "git-compat-util.h"
 #include "strbuf.h"
 
+struct ewah_bitmap;
 struct stat_data;
 
 struct json_writer
@@ -83,6 +85,7 @@ void jw_object_true(struct json_writer *jw, const char *key);
 void jw_object_false(struct json_writer *jw, const char *key);
 void jw_object_bool(struct json_writer *jw, const char *key, int value);
 void jw_object_null(struct json_writer *jw, const char *key);
+void jw_object_filemode(struct json_writer *jw, const char *key, mode_t value);
 void jw_object_stat_data(struct json_writer *jw, const char *key,
 			 const struct stat_data *sd);
 void jw_object_ewah(struct json_writer *jw, const char *key,
diff --git a/read-cache.c b/read-cache.c
index d7d9ce7260..c26edcc9d9 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1693,9 +1693,28 @@ static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 	return 0;
 }
 
+static struct index_entry_offset_table *do_read_ieot_extension(struct index_state *, const char *, uint32_t);
 static int read_index_extension(struct index_state *istate,
-				const char *ext, const char *data, unsigned long sz)
+				const char *map,
+				unsigned long *offset)
 {
+	int ret = 0;
+	const char *ext = map + *offset;
+	uint32_t sz = get_be32(ext + 4);
+	const char *data = ext + 8;
+
+	if (istate->jw) {
+		char buf[5];
+
+		memcpy(buf, ext, 4);
+		buf[4] = '\0';
+		jw_object_inline_begin_object(istate->jw, buf);
+
+		jw_object_intmax(istate->jw, "file_offset", *offset);
+		jw_object_intmax(istate->jw, "ext_size", sz);
+	}
+	*offset += sz + 8;
+
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
 		istate->cache_tree = cache_tree_read(data, sz, istate->jw);
@@ -1704,8 +1723,7 @@ static int read_index_extension(struct index_state *istate,
 		istate->resolve_undo = resolve_undo_read(data, sz, istate->jw);
 		break;
 	case CACHE_EXT_LINK:
-		if (read_link_extension(istate, data, sz))
-			return -1;
+		ret = read_link_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_UNTRACKED:
 		istate->untracked = read_untracked_extension(data, sz, istate->jw);
@@ -1714,17 +1732,31 @@ static int read_index_extension(struct index_state *istate,
 		read_fsmonitor_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_ENDOFINDEXENTRIES:
-	case CACHE_EXT_INDEXENTRYOFFSETTABLE:
+		if (istate->jw) {
+			/* must be synchronized with read_eoie_extension() */
+			jw_object_intmax(istate->jw, "offset", get_be32(data));
+			jw_object_string(istate->jw, "oid",
+					 hash_to_hex((const unsigned char*)data + sizeof(uint32_t)));
+		}
 		/* already handled in do_read_index() */
 		break;
+	case CACHE_EXT_INDEXENTRYOFFSETTABLE:
+		if (istate->jw) {
+			free(do_read_ieot_extension(istate, data, sz));
+		} else {
+			/* already handled in do_read_index() */
+		}
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
-			return error(_("index uses %.4s extension, which we do not understand"),
+			ret = error(_("index uses %.4s extension, which we do not understand"),
 				     ext);
-		fprintf_ln(stderr, _("ignoring %.4s extension"), ext);
+		else
+			  fprintf_ln(stderr, _("ignoring %.4s extension"), ext);
 		break;
 	}
-	return 0;
+	jw_end_gently(istate->jw);
+	return ret;
 }
 
 static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
@@ -1911,10 +1943,10 @@ struct index_entry_offset_table
 	struct index_entry_offset entries[FLEX_ARRAY];
 };
 
-static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset, struct json_writer *jw);
+static struct index_entry_offset_table *read_ieot_extension(struct index_state *istate, const char *mmap, size_t mmap_size, size_t offset);
 static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot);
 
-static size_t read_eoie_extension(const char *mmap, size_t mmap_size, struct json_writer *jw);
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
 
 struct load_index_extensions
@@ -1930,25 +1962,27 @@ static void *load_index_extensions(void *_data)
 {
 	struct load_index_extensions *p = _data;
 	unsigned long src_offset = p->src_offset;
+	int dump_json = 0;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
+		if (p->istate->jw && !dump_json) {
+			jw_object_inline_begin_object(p->istate->jw, "extensions");
+			dump_json = 1;
+		}
+		/*
+		 * After an array of active_nr index entries,
 		 * there can be arbitrary number of extended
 		 * sections, each of which is prefixed with
 		 * extension name (4-byte) and section length
 		 * in 4-byte network byte order.
 		 */
-		uint32_t extsize = get_be32(p->mmap + src_offset + 4);
-		if (read_index_extension(p->istate,
-					 p->mmap + src_offset,
-					 p->mmap + src_offset + 8,
-					 extsize) < 0) {
+		if (read_index_extension(p->istate, p->mmap, &src_offset) < 0) {
 			munmap((void *)p->mmap, p->mmap_size);
 			die(_("index file corrupt"));
 		}
-		src_offset += 8;
-		src_offset += extsize;
 	}
+	if (dump_json)
+		jw_end(p->istate->jw);
 
 	return NULL;
 }
@@ -1958,7 +1992,6 @@ static void dump_cache_entry(struct index_state *istate,
 			     unsigned long offset,
 			     const struct cache_entry *ce)
 {
-	struct strbuf sb = STRBUF_INIT;
 	struct json_writer *jw = istate->jw;
 
 	jw_array_inline_begin_object(jw);
@@ -1971,28 +2004,28 @@ static void dump_cache_entry(struct index_state *istate,
 
 	jw_object_string(jw, "name", ce->name);
 
-	strbuf_addf(&sb, "%06o", ce->ce_mode);
-	jw_object_string(jw, "mode", sb.buf);
-	strbuf_release(&sb);
+	jw_object_filemode(jw, "mode", ce->ce_mode);
 
 	jw_object_intmax(jw, "flags", ce->ce_flags);
 	/*
 	 * again redundant info, just so you don't have to decode
 	 * flags values manually
 	 */
+	if (ce->ce_flags & CE_EXTENDED)
+		jw_object_true(jw, "extended_flags");
 	if (ce->ce_flags & CE_VALID)
-		jw_object_true(jw, "assume-unchanged");
+		jw_object_true(jw, "assume_unchanged");
 	if (ce->ce_flags & CE_INTENT_TO_ADD)
-		jw_object_true(jw, "intent-to-add");
+		jw_object_true(jw, "intent_to_add");
 	if (ce->ce_flags & CE_SKIP_WORKTREE)
-		jw_object_true(jw, "skip-worktree");
+		jw_object_true(jw, "skip_worktree");
 	if (ce_stage(ce))
 		jw_object_intmax(jw, "stage", ce_stage(ce));
 
 	jw_object_string(jw, "oid", oid_to_hex(&ce->oid));
 
 	jw_object_stat_data(jw, "stat", &ce->ce_stat_data);
-	jw_object_intmax(jw, "file-offset", offset);
+	jw_object_intmax(jw, "file_offset", offset);
 
 	jw_end(jw);
 }
@@ -2232,28 +2265,21 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		nr_threads = 1;
 
 	if (istate->jw) {
-		size_t off;
-
 		jw_object_begin(istate->jw, jw_pretty);
 		jw_object_intmax(istate->jw, "version", istate->version);
 		jw_object_string(istate->jw, "oid", oid_to_hex(&istate->oid));
-		jw_object_intmax(istate->jw, "st_mtime.sec", istate->timestamp.sec);
-		jw_object_intmax(istate->jw, "st_mtime.nsec", istate->timestamp.nsec);
+		jw_object_intmax(istate->jw, "mtime_sec", istate->timestamp.sec);
+		jw_object_intmax(istate->jw, "mtime_nsec", istate->timestamp.nsec);
 
 		/*
 		 * Threading may mess up json writing. This is for
 		 * debugging only, so performance is not a concern.
 		 */
 		nr_threads = 1;
-		/* and dump EOIE/IOET extensions even with threading off */
-		off = read_eoie_extension(mmap, mmap_size, istate->jw);
-		if (off)
-			free(read_ieot_extension(mmap, mmap_size,
-						 off, istate->jw));
 	}
 
 	if (nr_threads > 1) {
-		extension_offset = read_eoie_extension(mmap, mmap_size, NULL);
+		extension_offset = read_eoie_extension(mmap, mmap_size);
 		if (extension_offset) {
 			int err;
 
@@ -2271,7 +2297,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	 * to multi-thread the reading of the cache entries.
 	 */
 	if (extension_offset && nr_threads > 1)
-		ieot = read_ieot_extension(mmap, mmap_size, extension_offset, NULL);
+		ieot = read_ieot_extension(istate, mmap, mmap_size, extension_offset);
 
 	if (ieot) {
 		src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, nr_threads, ieot);
@@ -3511,8 +3537,7 @@ int should_validate_cache_entries(void)
 #define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
 #define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
 
-static size_t read_eoie_extension(const char *mmap, size_t mmap_size,
-				  struct json_writer *jw)
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
 {
 	/*
 	 * The end of index entries (EOIE) extension is guaranteed to be last
@@ -3556,12 +3581,6 @@ static size_t read_eoie_extension(const char *mmap, size_t mmap_size,
 		return 0;
 	index += sizeof(uint32_t);
 
-	if (jw) {
-		jw_object_inline_begin_object(jw, "end-of-index");
-		jw_object_intmax(jw, "offset", offset);
-		jw_object_intmax(jw, "ext-size", extsize);
-		jw_object_inline_begin_array(jw, "extensions");
-	}
 	/*
 	 * The hash is computed over extension types and their sizes (but not
 	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
@@ -3590,24 +3609,9 @@ static size_t read_eoie_extension(const char *mmap, size_t mmap_size,
 
 		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
 
-		if (jw) {
-			char name[5];
-
-			jw_array_inline_begin_object(jw);
-			memcpy(name, mmap + src_offset, 4);
-			name[4] = '\0';
-			jw_object_string(jw, "name",  name);
-			jw_object_intmax(jw, "size", extsize);
-			jw_end(jw);
-		}
-
 		src_offset += 8;
 		src_offset += extsize;
 	}
-	if (jw) {
-		jw_end(jw);	/* extensions */
-		jw_end(jw);	/* end-of-index */
-	}
 	the_hash_algo->final_fn(hash, &c);
 	if (!hasheq(hash, (const unsigned char *)index))
 		return 0;
@@ -3636,13 +3640,12 @@ static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context,
 #define IEOT_VERSION	(1)
 
 static struct index_entry_offset_table *read_ieot_extension(
-	const char *mmap, size_t mmap_size,
-	size_t offset, struct json_writer *jw)
+		struct index_state *istate,
+		const char *mmap, size_t mmap_size,
+		size_t offset)
 {
 	const char *index = NULL;
-	uint32_t extsize, ext_version;
-	struct index_entry_offset_table *ieot;
-	int i, nr;
+	uint32_t extsize;
 
 	/* find the IEOT extension */
 	if (!offset)
@@ -3658,6 +3661,17 @@ static struct index_entry_offset_table *read_ieot_extension(
 	}
 	if (!index)
 		return NULL;
+	return do_read_ieot_extension(istate, index, extsize);
+}
+
+static struct index_entry_offset_table *do_read_ieot_extension(
+		struct index_state *istate,
+		const char *index,
+		uint32_t extsize)
+{
+	uint32_t ext_version;
+	struct index_entry_offset_table *ieot;
+	int i, nr;
 
 	/* validate the version is IEOT_VERSION */
 	ext_version = get_be32(index);
@@ -3673,11 +3687,9 @@ static struct index_entry_offset_table *read_ieot_extension(
 		error("invalid number of IEOT entries %d", nr);
 		return NULL;
 	}
-	if (jw) {
-		jw_object_inline_begin_object(jw, "index-entry-offsets");
-		jw_object_intmax(jw, "version", ext_version);
-		jw_object_intmax(jw, "ext-size", extsize);
-		jw_object_inline_begin_array(jw, "entries");
+	if (istate->jw) {
+		jw_object_intmax(istate->jw, "version", ext_version);
+		jw_object_inline_begin_array(istate->jw, "entries");
 	}
 	ieot = xmalloc(sizeof(struct index_entry_offset_table)
 		       + (nr * sizeof(struct index_entry_offset)));
@@ -3688,17 +3700,14 @@ static struct index_entry_offset_table *read_ieot_extension(
 		ieot->entries[i].nr = get_be32(index);
 		index += sizeof(uint32_t);
 
-		if (jw) {
-			jw_array_inline_begin_object(jw);
-			jw_object_intmax(jw, "offset", ieot->entries[i].offset);
-			jw_object_intmax(jw, "count", ieot->entries[i].nr);
-			jw_end(jw);
+		if (istate->jw) {
+			jw_array_inline_begin_object(istate->jw);
+			jw_object_intmax(istate->jw, "offset", ieot->entries[i].offset);
+			jw_object_intmax(istate->jw, "count", ieot->entries[i].nr);
+			jw_end(istate->jw);
 		}
 	}
-	if (jw) {
-		jw_end(jw);	/* entries */
-		jw_end(jw);	/* index-entry-offsets */
-	}
+	jw_end_gently(istate->jw);
 
 	return ieot;
 }
diff --git a/resolve-undo.c b/resolve-undo.c
index 999020bc40..68921e3dfe 100644
--- a/resolve-undo.c
+++ b/resolve-undo.c
@@ -50,6 +50,28 @@ void resolve_undo_write(struct strbuf *sb, struct string_list *resolve_undo)
 	}
 }
 
+static void dump_resolve_undo(struct json_writer *jw,
+			      const char *path,
+			      const struct resolve_undo_info *ui)
+{
+	int i;
+
+	if (!jw)
+		return;
+
+	jw_array_inline_begin_object(jw);
+	jw_object_string(jw, "path", path);
+
+	jw_object_inline_begin_array(jw, "stages");
+	for (i = 0; i < 3; i++) {
+		jw_array_inline_begin_object(jw);
+		jw_object_filemode(jw, "mode", ui->mode[i]);
+		jw_object_string(jw, "oid", oid_to_hex(&ui->oid[i]));
+		jw_end(jw);
+	}
+	jw_end(jw);
+}
+
 struct string_list *resolve_undo_read(const char *data, unsigned long size,
 				      struct json_writer *jw)
 {
@@ -61,11 +83,7 @@ struct string_list *resolve_undo_read(const char *data, unsigned long size,
 
 	resolve_undo = xcalloc(1, sizeof(*resolve_undo));
 	resolve_undo->strdup_strings = 1;
-	if (jw) {
-		jw_object_inline_begin_object(jw, "resolve-undo");
-		jw_object_intmax(jw, "ext-size", size);
-		jw_object_inline_begin_array(jw, "entries");
-	}
+	jw_object_inline_begin_array_gently(jw, "entries");
 
 	while (size) {
 		struct string_list_item *lost;
@@ -102,33 +120,9 @@ struct string_list *resolve_undo_read(const char *data, unsigned long size,
 			data += rawsz;
 		}
 
-		if (jw) {
-			struct strbuf sb = STRBUF_INIT;
-
-			jw_array_inline_begin_object(jw);
-			jw_object_string(jw, "path", lost->string);
-
-			jw_object_inline_begin_array(jw, "mode");
-			for (i = 0; i < 3; i++) {
-				strbuf_addf(&sb, "%06o", ui->mode[i]);
-				jw_array_string(jw, sb.buf);
-				strbuf_reset(&sb);
-			}
-			jw_end(jw);
-
-			jw_object_inline_begin_array(jw, "oid");
-			for (i = 0; i < 3; i++)
-				jw_array_string(jw, oid_to_hex(&ui->oid[i]));
-			jw_end(jw);
-
-			jw_end(jw);
-			strbuf_release(&sb);
-		}
-	}
-	if (jw) {
-		jw_end(jw);	/* entries */
-		jw_end(jw);	/* resolve-undo */
+		dump_resolve_undo(jw, lost->string, ui);
 	}
+	jw_end_gently(jw);
 	return resolve_undo;
 
 error:
diff --git a/split-index.c b/split-index.c
index d7b4420c92..41552bf771 100644
--- a/split-index.c
+++ b/split-index.c
@@ -17,7 +17,6 @@ int read_link_extension(struct index_state *istate,
 {
 	const unsigned char *data = data_;
 	struct split_index *si;
-	unsigned long original_sz = sz;
 	int ret;
 
 	if (sz < the_hash_algo->rawsz)
@@ -42,12 +41,9 @@ int read_link_extension(struct index_state *istate,
 		return error("garbage at the end of link extension");
 done:
 	if (istate->jw) {
-		jw_object_inline_begin_object(istate->jw, "split-index");
 		jw_object_string(istate->jw, "oid", oid_to_hex(&si->base_oid));
-		jw_object_ewah(istate->jw, "delete-bitmap", si->delete_bitmap);
-		jw_object_ewah(istate->jw, "replace-bitmap", si->replace_bitmap);
-		jw_object_intmax(istate->jw, "ext-size", original_sz);
-		jw_end(istate->jw);
+		jw_object_ewah(istate->jw, "delete_bitmap", si->delete_bitmap);
+		jw_object_ewah(istate->jw, "replace_bitmap", si->replace_bitmap);
 	}
 	return 0;
 }
diff --git a/t/t3008-ls-files-lazy-init-name-hash.sh b/t/t3008-ls-files-lazy-init-name-hash.sh
index 64f047332b..7f918c05f6 100755
--- a/t/t3008-ls-files-lazy-init-name-hash.sh
+++ b/t/t3008-ls-files-lazy-init-name-hash.sh
@@ -4,15 +4,9 @@ test_description='Test the lazy init name hash with various folder structures'
 
 . ./test-lib.sh
 
-if test 1 -eq $($GIT_BUILD_DIR/t/helper/test-tool online-cpus)
-then
-	skip_all='skipping lazy-init tests, single cpu'
-	test_done
-fi
-
 LAZY_THREAD_COST=2000
 
-test_expect_success 'no buffer overflow in lazy_init_name_hash' '
+test_expect_success !SINGLE_CPU 'no buffer overflow in lazy_init_name_hash' '
 	(
 	    test_seq $LAZY_THREAD_COST | sed "s/^/a_/" &&
 	    echo b/b/b &&
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
new file mode 100755
index 0000000000..9f4ad4c9cf
--- /dev/null
+++ b/t/t3011-ls-files-json.sh
@@ -0,0 +1,106 @@
+#!/bin/sh
+
+test_description='ls-files dumping json'
+
+. ./test-lib.sh
+
+strip_number() {
+	for name; do
+		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed
+	done
+}
+
+strip_string() {
+	for name; do
+		echo 's/\("'$name'":\) ".*"/\1 <string>/' >>filter.sed
+	done
+}
+
+compare_json() {
+	git ls-files --debug-json >json &&
+	sed -f filter.sed json >filtered &&
+	test_cmp "$TEST_DIRECTORY"/t3011/"$1" filtered
+}
+
+test_expect_success 'setup' '
+	mkdir sub &&
+	echo one >one &&
+	git add one &&
+	echo 2 >sub/two &&
+	git add sub/two &&
+
+	git commit -m first &&
+	git update-index --untracked-cache &&
+
+	echo intent-to-add >ita &&
+	git add -N ita &&
+
+	strip_number ctime_sec ctime_nsec mtime_sec mtime_nsec &&
+	strip_number device inode uid gid file_offset ext_size last_update &&
+	strip_string oid ident
+'
+
+test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
+	compare_json basic
+'
+
+test_expect_success 'ls-files --json, split index' '
+	git init split &&
+	(
+		cd split &&
+		echo one >one &&
+		git add one &&
+		git update-index --split-index &&
+		echo updated >>one &&
+		test_must_fail git -c splitIndex.maxPercentChange=100 update-index --refresh &&
+		cp ../filter.sed . &&
+		compare_json split-index
+	)
+'
+
+test_expect_success 'ls-files --json, fsmonitor extension ' '
+	git init fsmonitor &&
+	(
+		cd fsmonitor &&
+		echo one >one &&
+		git add one &&
+		git update-index --fsmonitor &&
+		cp ../filter.sed . &&
+		compare_json fsmonitor
+	)
+'
+
+test_expect_success 'ls-files --json, rerere extension' '
+	git init rerere &&
+	(
+		cd rerere &&
+		mkdir fi &&
+		test_commit initial fi/le first &&
+		git branch side &&
+		test_commit second fi/le second &&
+		git checkout side &&
+		test_commit third fi/le third &&
+		git checkout master &&
+		git config rerere.enabled true &&
+		test_must_fail git merge side &&
+		echo resolved >fi/le &&
+		git add fi/le &&
+		cp ../filter.sed . &&
+		compare_json rerere
+	)
+'
+
+test_expect_success !SINGLE_CPU 'ls-files --json and multicore extensions' '
+	git init eoie &&
+	(
+		cd eoie &&
+		git config index.threads 2 &&
+		touch one two three four &&
+		git add . &&
+		cp ../filter.sed . &&
+		strip_number offset &&
+		compare_json eoie
+	)
+'
+
+test_done
diff --git a/t/t3011/basic b/t/t3011/basic
new file mode 100644
index 0000000000..8e049f5350
--- /dev/null
+++ b/t/t3011/basic
@@ -0,0 +1,124 @@
+{
+  "version": 3,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "ita",
+      "mode": "100644",
+      "flags": 536887296,
+      "extended_flags": true,
+      "intent_to_add": true,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 1,
+      "name": "one",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 4
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 2,
+      "name": "sub/two",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 2
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "TREE": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "root": {
+        "oid": null,
+        "subdirs": [
+          {
+            "name": "sub",
+            "oid": <string>,
+            "entry_count": 1,
+            "subdirs": [
+            ]
+          }
+        ]
+      }
+    },
+    "UNTR": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "ident": <string>,
+      "info_exclude": {
+        "valid": true,
+        "oid": <string>,
+        "stat": {
+          "ctime_sec": <number>,
+          "ctime_nsec": <number>,
+          "mtime_sec": <number>,
+          "mtime_nsec": <number>,
+          "device": <number>,
+          "inode": <number>,
+          "uid": <number>,
+          "gid": <number>,
+          "size": 0
+        }
+      },
+      "excludes_file": {
+        "valid": true,
+        "oid": <string>,
+        "stat": {
+          "ctime_sec": <number>,
+          "ctime_nsec": <number>,
+          "mtime_sec": <number>,
+          "mtime_nsec": <number>,
+          "device": <number>,
+          "inode": <number>,
+          "uid": <number>,
+          "gid": <number>,
+          "size": 0
+        }
+      },
+      "flags": 6,
+      "show_other_directories": true,
+      "hide_empty_directories": true,
+      "excludes_per_dir": ".gitignore"
+    }
+  }
+}
diff --git a/t/t3011/eoie b/t/t3011/eoie
new file mode 100644
index 0000000000..66a0feb3b6
--- /dev/null
+++ b/t/t3011/eoie
@@ -0,0 +1,107 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "four",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 1,
+      "name": "one",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 2,
+      "name": "three",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 3,
+      "name": "two",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "IEOT": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "version": 1,
+      "entries": [
+        {
+          "offset": <number>,
+          "count": 2
+        },
+        {
+          "offset": <number>,
+          "count": 2
+        }
+      ]
+    },
+    "EOIE": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "offset": <number>,
+      "oid": <string>
+    }
+  }
+}
diff --git a/t/t3011/fsmonitor b/t/t3011/fsmonitor
new file mode 100644
index 0000000000..17f2d4a664
--- /dev/null
+++ b/t/t3011/fsmonitor
@@ -0,0 +1,38 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "one",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 4
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "FSMN": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "version": 1,
+      "last_update": <number>,
+      "dirty": [
+        0
+      ]
+    }
+  }
+}
diff --git a/t/t3011/rerere b/t/t3011/rerere
new file mode 100644
index 0000000000..a8ec4b16ee
--- /dev/null
+++ b/t/t3011/rerere
@@ -0,0 +1,66 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "fi/le",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 9
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "TREE": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "root": {
+        "oid": null,
+        "subdirs": [
+          {
+            "name": "fi",
+            "oid": null,
+            "subdirs": [
+            ]
+          }
+        ]
+      }
+    },
+    "REUC": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "entries": [
+        {
+          "path": "fi/le",
+          "stages": [
+            {
+              "mode": "100644",
+              "oid": <string>
+            },
+            {
+              "mode": "100644",
+              "oid": <string>
+            },
+            {
+              "mode": "100644",
+              "oid": <string>
+            }
+          ]
+        }
+      ]
+    }
+  }
diff --git a/t/t3011/split-index b/t/t3011/split-index
new file mode 100644
index 0000000000..cdcc4ddded
--- /dev/null
+++ b/t/t3011/split-index
@@ -0,0 +1,39 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 4
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "link": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "oid": <string>,
+      "delete_bitmap": [
+      ],
+      "replace_bitmap": [
+        0
+      ]
+    }
+  }
+}
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 4b346467df..9d5b273b40 100644
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -1611,3 +1611,7 @@ test_lazy_prereq REBASE_P '
 test_lazy_prereq FAIL_PREREQS '
 	test -n "$GIT_TEST_FAIL_PREREQS"
 '
+
+test_lazy_prereq SINGLE_CPU '
+	test "$(test-tool online-cpus)" -eq 1
+'

Nguyễn Thái Ngọc Duy (10):
  ls-files: add --json to dump the index
  read-cache.c: dump common extension info in json
  cache-tree.c: dump "TREE" extension as json
  dir.c: dump "UNTR" extension as json
  split-index.c: dump "link" extension as json
  fsmonitor.c: dump "FSMN" extension as json
  resolve-undo.c: dump "REUC" extension as json
  read-cache.c: dump "EOIE" extension as json
  read-cache.c: dump "IEOT" extension as json
  t3008: use the new SINGLE_CPU prereq

 Documentation/git-ls-files.txt          |   5 +
 builtin/ls-files.c                      |  38 ++++-
 cache-tree.c                            |  36 ++++-
 cache-tree.h                            |   5 +-
 cache.h                                 |   2 +
 dir.c                                   |  57 +++++++-
 dir.h                                   |   4 +-
 fsmonitor.c                             |   6 +
 json-writer.c                           |  36 +++++
 json-writer.h                           |  32 +++++
 read-cache.c                            | 178 ++++++++++++++++++++----
 resolve-undo.c                          |  30 +++-
 resolve-undo.h                          |   4 +-
 split-index.c                           |   9 +-
 t/t3008-ls-files-lazy-init-name-hash.sh |   8 +-
 t/t3011-ls-files-json.sh (new +x)       | 106 ++++++++++++++
 t/t3011/basic (new)                     | 124 +++++++++++++++++
 t/t3011/eoie (new)                      | 107 ++++++++++++++
 t/t3011/fsmonitor (new)                 |  38 +++++
 t/t3011/rerere (new)                    |  66 +++++++++
 t/t3011/split-index (new)               |  39 ++++++
 t/test-lib.sh                           |   4 +
 22 files changed, 884 insertions(+), 50 deletions(-)
 create mode 100755 t/t3011-ls-files-json.sh
 create mode 100644 t/t3011/basic
 create mode 100644 t/t3011/eoie
 create mode 100644 t/t3011/fsmonitor
 create mode 100644 t/t3011/rerere
 create mode 100644 t/t3011/split-index

-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 19:15   ` Jeff Hostetler
                     ` (3 more replies)
  2019-06-24 13:02 ` [PATCH v2 02/10] read-cache.c: dump common extension info in json Nguyễn Thái Ngọc Duy
                   ` (9 subsequent siblings)
  10 siblings, 4 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

So far we don't have a command to simply dump the index file out,
with all its gory details. Checking some info, for example, stat
time, usually involves either writing new code or firing up "xxd" and
decoding the values yourself.

--debug-json is supposed to help with that. It dumps the index in a
human readable format that is also easy to process with tools. And it
will print almost enough info to reconstruct the index later.

In this patch we only dump the main part, not extensions. But at the
end of the series, the entire index is dumped. The end result could be
very verbose even on a small repository such as git.git.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-ls-files.txt    |  5 +++
 builtin/ls-files.c                | 38 +++++++++++++---
 cache.h                           |  2 +
 json-writer.c                     | 22 ++++++++++
 json-writer.h                     | 23 ++++++++++
 read-cache.c                      | 72 ++++++++++++++++++++++++++++++-
 t/t3011-ls-files-json.sh (new +x) | 44 +++++++++++++++++++
 t/t3011/basic (new)               | 67 ++++++++++++++++++++++++++++
 8 files changed, 265 insertions(+), 8 deletions(-)

diff --git a/Documentation/git-ls-files.txt b/Documentation/git-ls-files.txt
index 8461c0e83e..fec5cb7170 100644
--- a/Documentation/git-ls-files.txt
+++ b/Documentation/git-ls-files.txt
@@ -162,6 +162,11 @@ a space) at the start of each line:
 	possible for manual inspection; the exact format may change at
 	any time.
 
+--debug-json::
+	Dump the entire index content in JSON format. This is for
+	debugging purposes. The JSON structure is subject to change.
+	Note that the strings are not always valid UTF-8.
+
 --eol::
 	Show <eolinfo> and <eolattr> of files.
 	<eolinfo> is the file content identification used by Git when
diff --git a/builtin/ls-files.c b/builtin/ls-files.c
index 7f83c9a6f2..b60cd9ab28 100644
--- a/builtin/ls-files.c
+++ b/builtin/ls-files.c
@@ -8,6 +8,7 @@
 #include "cache.h"
 #include "repository.h"
 #include "config.h"
+#include "json-writer.h"
 #include "quote.h"
 #include "dir.h"
 #include "builtin.h"
@@ -31,6 +32,7 @@ static int show_modified;
 static int show_killed;
 static int show_valid_bit;
 static int show_fsmonitor_bit;
+static int show_json;
 static int line_terminator = '\n';
 static int debug_mode;
 static int show_eol;
@@ -577,6 +579,8 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 			N_("pretend that paths removed since <tree-ish> are still present")),
 		OPT__ABBREV(&abbrev),
 		OPT_BOOL(0, "debug", &debug_mode, N_("show debugging data")),
+		OPT_BOOL(0, "debug-json", &show_json,
+			N_("dump index content in JSON format")),
 		OPT_END()
 	};
 
@@ -632,7 +636,7 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 		    "--error-unmatch");
 
 	parse_pathspec(&pathspec, 0,
-		       PATHSPEC_PREFER_CWD,
+		       show_json ? PATHSPEC_PREFER_FULL : PATHSPEC_PREFER_CWD,
 		       prefix, argv);
 
 	/*
@@ -660,8 +664,18 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 
 	/* With no flags, we default to showing the cached files */
 	if (!(show_stage || show_deleted || show_others || show_unmerged ||
-	      show_killed || show_modified || show_resolve_undo))
+	      show_killed || show_modified || show_resolve_undo || show_json))
 		show_cached = 1;
+	if (show_json && (show_stage || show_deleted || show_others ||
+			  show_unmerged || show_killed || show_modified ||
+			  show_cached || pathspec.nr))
+		die(_("--debug-json cannot be used with other file selection options"));
+	if (show_json && show_resolve_undo)
+		die(_("--debug-json cannot be used with %s"), "--resolve-undo");
+	if (show_json && with_tree)
+		die(_("--debug-json cannot be used with %s"), "--with-tree");
+	if (show_json && debug_mode)
+		die(_("--debug-json cannot be used with %s"), "--debug");
 
 	if (with_tree) {
 		/*
@@ -673,10 +687,22 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 		overlay_tree_on_index(the_repository->index, with_tree, max_prefix);
 	}
 
-	show_files(the_repository, &dir);
-
-	if (show_resolve_undo)
-		show_ru_info(the_repository->index);
+	if (!show_json) {
+		show_files(the_repository, &dir);
+
+		if (show_resolve_undo)
+			show_ru_info(the_repository->index);
+	} else {
+		struct json_writer jw = JSON_WRITER_INIT;
+
+		discard_index(the_repository->index);
+		the_repository->index->jw = &jw;
+		if (repo_read_index(the_repository) < 0)
+			die("index file corrupt");
+		puts(jw.json.buf);
+		the_repository->index->jw = NULL;
+		jw_release(&jw);
+	}
 
 	if (ps_matched) {
 		int bad;
diff --git a/cache.h b/cache.h
index bf20337ef4..84d0aeed20 100644
--- a/cache.h
+++ b/cache.h
@@ -326,6 +326,7 @@ static inline unsigned int canon_mode(unsigned int mode)
 #define UNTRACKED_CHANGED	(1 << 7)
 #define FSMONITOR_CHANGED	(1 << 8)
 
+struct json_writer;
 struct split_index;
 struct untracked_cache;
 
@@ -350,6 +351,7 @@ struct index_state {
 	uint64_t fsmonitor_last_update;
 	struct ewah_bitmap *fsmonitor_dirty;
 	struct mem_pool *ce_mem_pool;
+	struct json_writer *jw;
 };
 
 /* Name hashing */
diff --git a/json-writer.c b/json-writer.c
index aadb9dbddc..0608726512 100644
--- a/json-writer.c
+++ b/json-writer.c
@@ -202,6 +202,28 @@ void jw_object_null(struct json_writer *jw, const char *key)
 	strbuf_addstr(&jw->json, "null");
 }
 
+void jw_object_filemode(struct json_writer *jw, const char *key, mode_t mode)
+{
+	object_common(jw, key);
+	strbuf_addf(&jw->json, "\"%06o\"", mode);
+}
+
+void jw_object_stat_data(struct json_writer *jw, const char *name,
+			 const struct stat_data *sd)
+{
+	jw_object_inline_begin_object(jw, name);
+	jw_object_intmax(jw, "ctime_sec", sd->sd_ctime.sec);
+	jw_object_intmax(jw, "ctime_nsec", sd->sd_ctime.nsec);
+	jw_object_intmax(jw, "mtime_sec", sd->sd_mtime.sec);
+	jw_object_intmax(jw, "mtime_nsec", sd->sd_mtime.nsec);
+	jw_object_intmax(jw, "device", sd->sd_dev);
+	jw_object_intmax(jw, "inode", sd->sd_ino);
+	jw_object_intmax(jw, "uid", sd->sd_uid);
+	jw_object_intmax(jw, "gid", sd->sd_gid);
+	jw_object_intmax(jw, "size", sd->sd_size);
+	jw_end(jw);
+}
+
 static void increase_indent(struct strbuf *sb,
 			    const struct json_writer *jw,
 			    int indent)
diff --git a/json-writer.h b/json-writer.h
index 83906b09c1..c48c4cbf33 100644
--- a/json-writer.h
+++ b/json-writer.h
@@ -42,8 +42,11 @@
  * of the given strings.
  */
 
+#include "git-compat-util.h"
 #include "strbuf.h"
 
+struct stat_data;
+
 struct json_writer
 {
 	/*
@@ -81,6 +84,9 @@ void jw_object_true(struct json_writer *jw, const char *key);
 void jw_object_false(struct json_writer *jw, const char *key);
 void jw_object_bool(struct json_writer *jw, const char *key, int value);
 void jw_object_null(struct json_writer *jw, const char *key);
+void jw_object_filemode(struct json_writer *jw, const char *key, mode_t value);
+void jw_object_stat_data(struct json_writer *jw, const char *key,
+			 const struct stat_data *sd);
 void jw_object_sub_jw(struct json_writer *jw, const char *key,
 		      const struct json_writer *value);
 
@@ -104,4 +110,21 @@ void jw_array_inline_begin_array(struct json_writer *jw);
 int jw_is_terminated(const struct json_writer *jw);
 void jw_end(struct json_writer *jw);
 
+/*
+ * These _gently versions accept a NULL json_writer to avoid excessive
+ * branching at the call sites.
+ */
+static inline void jw_object_inline_begin_array_gently(struct json_writer *jw,
+						       const char *name)
+{
+	if (jw)
+		jw_object_inline_begin_array(jw, name);
+}
+
+static inline void jw_end_gently(struct json_writer *jw)
+{
+	if (jw)
+		jw_end(jw);
+}
+
 #endif /* JSON_WRITER_H */
diff --git a/read-cache.c b/read-cache.c
index 4dd22f4f6e..db5147d088 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -25,6 +25,7 @@
 #include "fsmonitor.h"
 #include "thread-utils.h"
 #include "progress.h"
+#include "json-writer.h"
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1952,6 +1953,49 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+static void dump_cache_entry(struct index_state *istate,
+			     int index,
+			     unsigned long offset,
+			     const struct cache_entry *ce)
+{
+	struct json_writer *jw = istate->jw;
+
+	jw_array_inline_begin_object(jw);
+
+	/*
+	 * this is technically redundant, but it's for easier
+	 * navigation when there are hundreds of entries
+	 */
+	jw_object_intmax(jw, "id", index);
+
+	jw_object_string(jw, "name", ce->name);
+
+	jw_object_filemode(jw, "mode", ce->ce_mode);
+
+	jw_object_intmax(jw, "flags", ce->ce_flags);
+	/*
+	 * again redundant info, just so you don't have to decode
+	 * flags values manually
+	 */
+	if (ce->ce_flags & CE_EXTENDED)
+		jw_object_true(jw, "extended_flags");
+	if (ce->ce_flags & CE_VALID)
+		jw_object_true(jw, "assume_unchanged");
+	if (ce->ce_flags & CE_INTENT_TO_ADD)
+		jw_object_true(jw, "intent_to_add");
+	if (ce->ce_flags & CE_SKIP_WORKTREE)
+		jw_object_true(jw, "skip_worktree");
+	if (ce_stage(ce))
+		jw_object_intmax(jw, "stage", ce_stage(ce));
+
+	jw_object_string(jw, "oid", oid_to_hex(&ce->oid));
+
+	jw_object_stat_data(jw, "stat", &ce->ce_stat_data);
+	jw_object_intmax(jw, "file_offset", offset);
+
+	jw_end(jw);
+}
+
 /*
  * A helper function that will load the specified range of cache entries
  * from the memory mapped file and add them to the given index.
@@ -1972,6 +2016,9 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
+		if (istate->jw)
+			dump_cache_entry(istate, i, src_offset, ce);
+
 		src_offset += consumed;
 		previous_ce = ce;
 	}
@@ -1983,6 +2030,8 @@ static unsigned long load_all_cache_entries(struct index_state *istate,
 {
 	unsigned long consumed;
 
+	jw_object_inline_begin_array_gently(istate->jw, "entries");
+
 	if (istate->version == 4) {
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size_from_compressed(istate->cache_nr));
@@ -1993,6 +2042,8 @@ static unsigned long load_all_cache_entries(struct index_state *istate,
 
 	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
 					0, istate->cache_nr, mmap, src_offset, NULL);
+
+	jw_end_gently(istate->jw);
 	return consumed;
 }
 
@@ -2120,6 +2171,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	size_t extension_offset = 0;
 	int nr_threads, cpus;
 	struct index_entry_offset_table *ieot = NULL;
+	int jw_pretty = 1;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -2154,6 +2206,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache_nr = ntohl(hdr->hdr_entries);
 	istate->cache_alloc = alloc_nr(istate->cache_nr);
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
+	istate->timestamp.sec = st.st_mtime;
+	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 	istate->initialized = 1;
 
 	p.istate = istate;
@@ -2176,6 +2230,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	if (!HAVE_THREADS)
 		nr_threads = 1;
 
+	if (istate->jw) {
+		jw_object_begin(istate->jw, jw_pretty);
+		jw_object_intmax(istate->jw, "version", istate->version);
+		jw_object_string(istate->jw, "oid", oid_to_hex(&istate->oid));
+		jw_object_intmax(istate->jw, "mtime_sec", istate->timestamp.sec);
+		jw_object_intmax(istate->jw, "mtime_nsec", istate->timestamp.nsec);
+
+		/*
+		 * Threading may mess up json writing. This is for
+		 * debugging only, so performance is not a concern.
+		 */
+		nr_threads = 1;
+	}
+
 	if (nr_threads > 1) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
 		if (extension_offset) {
@@ -2204,8 +2272,6 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 	}
 
-	istate->timestamp.sec = st.st_mtime;
-	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
 	/* if we created a thread, join it otherwise load the extensions on the primary thread */
 	if (extension_offset) {
@@ -2216,6 +2282,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		p.src_offset = src_offset;
 		load_index_extensions(&p);
 	}
+	jw_end_gently(istate->jw);
+
 	munmap((void *)mmap, mmap_size);
 
 	/*
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
new file mode 100755
index 0000000000..97bcd814be
--- /dev/null
+++ b/t/t3011-ls-files-json.sh
@@ -0,0 +1,44 @@
+#!/bin/sh
+
+test_description='ls-files dumping json'
+
+. ./test-lib.sh
+
+strip_number() {
+	for name; do
+		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed
+	done
+}
+
+strip_string() {
+	for name; do
+		echo 's/\("'$name'":\) ".*"/\1 <string>/' >>filter.sed
+	done
+}
+
+compare_json() {
+	git ls-files --debug-json >json &&
+	sed -f filter.sed json >filtered &&
+	test_cmp "$TEST_DIRECTORY"/t3011/"$1" filtered
+}
+
+test_expect_success 'setup' '
+	mkdir sub &&
+	echo one >one &&
+	git add one &&
+	echo 2 >sub/two &&
+	git add sub/two &&
+
+	echo intent-to-add >ita &&
+	git add -N ita &&
+
+	strip_number ctime_sec ctime_nsec mtime_sec mtime_nsec &&
+	strip_number device inode uid gid file_offset ext_size &&
+	strip_string oid ident
+'
+
+test_expect_success 'ls-files --json, main entries' '
+	compare_json basic
+'
+
+test_done
diff --git a/t/t3011/basic b/t/t3011/basic
new file mode 100644
index 0000000000..9436445d90
--- /dev/null
+++ b/t/t3011/basic
@@ -0,0 +1,67 @@
+{
+  "version": 3,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "ita",
+      "mode": "100644",
+      "flags": 536887296,
+      "extended_flags": true,
+      "intent_to_add": true,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 1,
+      "name": "one",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 4
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 2,
+      "name": "sub/two",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 2
+      },
+      "file_offset": <number>
+    }
+  ]
+}
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 02/10] read-cache.c: dump common extension info in json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 03/10] cache-tree.c: dump "TREE" extension as json Nguyễn Thái Ngọc Duy
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 49 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index db5147d088..4accd8bb08 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1694,8 +1694,26 @@ static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, const char *data, unsigned long sz)
+				const char *map,
+				unsigned long *offset)
 {
+	int ret = 0;
+	const char *ext = map + *offset;
+	uint32_t sz = get_be32(ext + 4);
+	const char *data = ext + 8;
+
+	if (istate->jw) {
+		char buf[5];
+
+		memcpy(buf, ext, 4);
+		buf[4] = '\0';
+		jw_object_inline_begin_object(istate->jw, buf);
+
+		jw_object_intmax(istate->jw, "file_offset", *offset);
+		jw_object_intmax(istate->jw, "ext_size", sz);
+	}
+	*offset += sz + 8;
+
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
 		istate->cache_tree = cache_tree_read(data, sz);
@@ -1704,8 +1722,7 @@ static int read_index_extension(struct index_state *istate,
 		istate->resolve_undo = resolve_undo_read(data, sz);
 		break;
 	case CACHE_EXT_LINK:
-		if (read_link_extension(istate, data, sz))
-			return -1;
+		ret = read_link_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_UNTRACKED:
 		istate->untracked = read_untracked_extension(data, sz);
@@ -1719,12 +1736,14 @@ static int read_index_extension(struct index_state *istate,
 		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
-			return error(_("index uses %.4s extension, which we do not understand"),
+			ret = error(_("index uses %.4s extension, which we do not understand"),
 				     ext);
-		fprintf_ln(stderr, _("ignoring %.4s extension"), ext);
+		else
+			  fprintf_ln(stderr, _("ignoring %.4s extension"), ext);
 		break;
 	}
-	return 0;
+	jw_end_gently(istate->jw);
+	return ret;
 }
 
 static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
@@ -1930,25 +1949,27 @@ static void *load_index_extensions(void *_data)
 {
 	struct load_index_extensions *p = _data;
 	unsigned long src_offset = p->src_offset;
+	int dump_json = 0;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
+		if (p->istate->jw && !dump_json) {
+			jw_object_inline_begin_object(p->istate->jw, "extensions");
+			dump_json = 1;
+		}
+		/*
+		 * After an array of active_nr index entries,
 		 * there can be arbitrary number of extended
 		 * sections, each of which is prefixed with
 		 * extension name (4-byte) and section length
 		 * in 4-byte network byte order.
 		 */
-		uint32_t extsize = get_be32(p->mmap + src_offset + 4);
-		if (read_index_extension(p->istate,
-					 p->mmap + src_offset,
-					 p->mmap + src_offset + 8,
-					 extsize) < 0) {
+		if (read_index_extension(p->istate, p->mmap, &src_offset) < 0) {
 			munmap((void *)p->mmap, p->mmap_size);
 			die(_("index file corrupt"));
 		}
-		src_offset += 8;
-		src_offset += extsize;
 	}
+	if (dump_json)
+		jw_end(p->istate->jw);
 
 	return NULL;
 }
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 03/10] cache-tree.c: dump "TREE" extension as json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 02/10] read-cache.c: dump common extension info in json Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 04/10] dir.c: dump "UNTR" " Nguyễn Thái Ngọc Duy
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 cache-tree.c             | 36 +++++++++++++++++++++++++++++++-----
 cache-tree.h             |  5 ++++-
 read-cache.c             |  2 +-
 t/t3011-ls-files-json.sh |  4 +++-
 t/t3011/basic            | 20 +++++++++++++++++++-
 5 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/cache-tree.c b/cache-tree.c
index b13bfaf71e..b6a233307e 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -3,6 +3,7 @@
 #include "tree.h"
 #include "tree-walk.h"
 #include "cache-tree.h"
+#include "json-writer.h"
 #include "object-store.h"
 #include "replace-object.h"
 
@@ -492,7 +493,8 @@ void cache_tree_write(struct strbuf *sb, struct cache_tree *root)
 	write_one(sb, root, "", 0);
 }
 
-static struct cache_tree *read_one(const char **buffer, unsigned long *size_p)
+static struct cache_tree *read_one(const char **buffer, unsigned long *size_p,
+				   struct json_writer *jw)
 {
 	const char *buf = *buffer;
 	unsigned long size = *size_p;
@@ -546,6 +548,15 @@ static struct cache_tree *read_one(const char **buffer, unsigned long *size_p)
 			*buffer, subtree_nr);
 #endif
 
+	if (jw) {
+		if (it->entry_count >= 0) {
+			jw_object_string(jw, "oid", oid_to_hex(&it->oid));
+			jw_object_intmax(jw, "entry_count", it->entry_count);
+		} else {
+			jw_object_null(jw, "oid");
+		}
+		jw_object_inline_begin_array(jw, "subdirs");
+	}
 	/*
 	 * Just a heuristic -- we do not add directories that often but
 	 * we do not want to have to extend it immediately when we do,
@@ -559,12 +570,18 @@ static struct cache_tree *read_one(const char **buffer, unsigned long *size_p)
 		struct cache_tree_sub *subtree;
 		const char *name = buf;
 
-		sub = read_one(&buf, &size);
+		if (jw) {
+			jw_array_inline_begin_object(jw);
+			jw_object_string(jw, "name", name);
+		}
+		sub = read_one(&buf, &size, jw);
+		jw_end_gently(jw);
 		if (!sub)
 			goto free_return;
 		subtree = cache_tree_sub(it, name);
 		subtree->cache_tree = sub;
 	}
+	jw_end_gently(jw);
 	if (subtree_nr != it->subtree_nr)
 		die("cache-tree: internal error");
 	*buffer = buf;
@@ -576,11 +593,20 @@ static struct cache_tree *read_one(const char **buffer, unsigned long *size_p)
 	return NULL;
 }
 
-struct cache_tree *cache_tree_read(const char *buffer, unsigned long size)
+struct cache_tree *cache_tree_read(const char *buffer, unsigned long size,
+				   struct json_writer *jw)
 {
+	struct cache_tree *ret;
+
+	if (jw) {
+		jw_object_inline_begin_object(jw, "root");
+	}
 	if (buffer[0])
-		return NULL; /* not the whole tree */
-	return read_one(&buffer, &size);
+		ret = NULL; /* not the whole tree */
+	else
+		ret = read_one(&buffer, &size, jw);
+	jw_end_gently(jw);
+	return ret;
 }
 
 static struct cache_tree *cache_tree_find(struct cache_tree *it, const char *path)
diff --git a/cache-tree.h b/cache-tree.h
index 757bbc48bc..fc3c73284b 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -6,6 +6,8 @@
 #include "tree-walk.h"
 
 struct cache_tree;
+struct json_writer;
+
 struct cache_tree_sub {
 	struct cache_tree *cache_tree;
 	int count;		/* internally used by update_one() */
@@ -28,7 +30,8 @@ void cache_tree_invalidate_path(struct index_state *, const char *);
 struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
-struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
+struct cache_tree *cache_tree_read(const char *buffer, unsigned long size,
+				   struct json_writer *jw);
 
 int cache_tree_fully_valid(struct cache_tree *);
 int cache_tree_update(struct index_state *, int);
diff --git a/read-cache.c b/read-cache.c
index 4accd8bb08..d09ce42b9a 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1716,7 +1716,7 @@ static int read_index_extension(struct index_state *istate,
 
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
-		istate->cache_tree = cache_tree_read(data, sz);
+		istate->cache_tree = cache_tree_read(data, sz, istate->jw);
 		break;
 	case CACHE_EXT_RESOLVE_UNDO:
 		istate->resolve_undo = resolve_undo_read(data, sz);
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
index 97bcd814be..fc313f2c9a 100755
--- a/t/t3011-ls-files-json.sh
+++ b/t/t3011-ls-files-json.sh
@@ -29,6 +29,8 @@ test_expect_success 'setup' '
 	echo 2 >sub/two &&
 	git add sub/two &&
 
+	git commit -m first &&
+
 	echo intent-to-add >ita &&
 	git add -N ita &&
 
@@ -37,7 +39,7 @@ test_expect_success 'setup' '
 	strip_string oid ident
 '
 
-test_expect_success 'ls-files --json, main entries' '
+test_expect_success 'ls-files --json, main entries and TREE' '
 	compare_json basic
 '
 
diff --git a/t/t3011/basic b/t/t3011/basic
index 9436445d90..e27f5be5ff 100644
--- a/t/t3011/basic
+++ b/t/t3011/basic
@@ -63,5 +63,23 @@
       },
       "file_offset": <number>
     }
-  ]
+  ],
+  "extensions": {
+    "TREE": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "root": {
+        "oid": null,
+        "subdirs": [
+          {
+            "name": "sub",
+            "oid": <string>,
+            "entry_count": 1,
+            "subdirs": [
+            ]
+          }
+        ]
+      }
+    }
+  }
 }
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 04/10] dir.c: dump "UNTR" extension as json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (2 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 03/10] cache-tree.c: dump "TREE" extension as json Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 19:32   ` Jeff Hostetler
  2019-06-24 13:02 ` [PATCH v2 05/10] split-index.c: dump "link" " Nguyễn Thái Ngọc Duy
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Most of the UNTR extension is dumped at the end of parsing rather than
as soon as each part is read, because some fields in
untracked_cache_dir are only "patched" in from the EWAH bitmaps at the
end.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 dir.c                    | 57 +++++++++++++++++++++++++++++++++++++++-
 dir.h                    |  4 ++-
 json-writer.h            |  6 +++++
 read-cache.c             |  2 +-
 t/t3011-ls-files-json.sh |  3 ++-
 t/t3011/basic            | 39 +++++++++++++++++++++++++++
 6 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/dir.c b/dir.c
index ba4a51c296..8808577ea3 100644
--- a/dir.c
+++ b/dir.c
@@ -19,6 +19,7 @@
 #include "varint.h"
 #include "ewah/ewok.h"
 #include "fsmonitor.h"
+#include "json-writer.h"
 #include "submodule-config.h"
 
 /*
@@ -2826,7 +2827,42 @@ static void load_oid_stat(struct oid_stat *oid_stat, const unsigned char *data,
 	oid_stat->valid = 1;
 }
 
-struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz)
+static void jw_object_oid_stat(struct json_writer *jw, const char *key,
+			       const struct oid_stat *oid_stat)
+{
+	jw_object_inline_begin_object(jw, key);
+	jw_object_bool(jw, "valid", oid_stat->valid);
+	jw_object_string(jw, "oid", oid_to_hex(&oid_stat->oid));
+	jw_object_stat_data(jw, "stat", &oid_stat->stat);
+	jw_end(jw);
+}
+
+static void jw_object_untracked_cache_dir(struct json_writer *jw,
+					  const struct untracked_cache_dir *ucd)
+{
+	int i;
+
+	jw_object_bool(jw, "valid", ucd->valid);
+	jw_object_bool(jw, "check-only", ucd->check_only);
+	jw_object_stat_data(jw, "stat", &ucd->stat_data);
+	jw_object_string(jw, "exclude-oid", oid_to_hex(&ucd->exclude_oid));
+	jw_object_inline_begin_array(jw, "untracked");
+	for (i = 0; i < ucd->untracked_nr; i++)
+		jw_array_string(jw, ucd->untracked[i]);
+	jw_end(jw);
+
+	jw_object_inline_begin_object(jw, "dirs");
+	for (i = 0; i < ucd->dirs_nr; i++) {
+		jw_object_inline_begin_object(jw, ucd->dirs[i]->name);
+		jw_object_untracked_cache_dir(jw, ucd->dirs[i]);
+		jw_end(jw);
+	}
+	jw_end(jw);
+}
+
+struct untracked_cache *read_untracked_extension(const void *data,
+						 unsigned long sz,
+						 struct json_writer *jw)
 {
 	struct untracked_cache *uc;
 	struct read_data rd;
@@ -2864,6 +2900,19 @@ struct untracked_cache *read_untracked_extension(const void *data, unsigned long
 	uc->dir_flags = get_be32(next + ouc_offset(dir_flags));
 	exclude_per_dir = (const char *)next + exclude_per_dir_offset;
 	uc->exclude_per_dir = xstrdup(exclude_per_dir);
+
+	if (jw) {
+		jw_object_string(jw, "ident", ident);
+		jw_object_oid_stat(jw, "info_exclude", &uc->ss_info_exclude);
+		jw_object_oid_stat(jw, "excludes_file", &uc->ss_excludes_file);
+		jw_object_intmax(jw, "flags", uc->dir_flags);
+		if (uc->dir_flags & DIR_SHOW_OTHER_DIRECTORIES)
+			jw_object_bool(jw, "show_other_directories", 1);
+		if (uc->dir_flags & DIR_HIDE_EMPTY_DIRECTORIES)
+			jw_object_bool(jw, "hide_empty_directories", 1);
+		jw_object_string(jw, "excludes_per_dir", uc->exclude_per_dir);
+	}
+
 	/* NUL after exclude_per_dir is covered by sizeof(*ouc) */
 	next += exclude_per_dir_offset + strlen(exclude_per_dir) + 1;
 	if (next >= end)
@@ -2905,6 +2954,12 @@ struct untracked_cache *read_untracked_extension(const void *data, unsigned long
 	ewah_each_bit(rd.sha1_valid, read_oid, &rd);
 	next = rd.data;
 
+	if (jw) {
+		jw_object_inline_begin_object(jw, "root");
+		jw_object_untracked_cache_dir(jw, uc->root);
+		jw_end(jw);
+	}
+
 done:
 	free(rd.ucd);
 	ewah_free(rd.valid);
diff --git a/dir.h b/dir.h
index 680079bbe3..80efdd05c4 100644
--- a/dir.h
+++ b/dir.h
@@ -6,6 +6,8 @@
 #include "cache.h"
 #include "strbuf.h"
 
+struct json_writer;
+
 struct dir_entry {
 	unsigned int len;
 	char name[FLEX_ARRAY]; /* more */
@@ -362,7 +364,7 @@ void untracked_cache_remove_from_index(struct index_state *, const char *);
 void untracked_cache_add_to_index(struct index_state *, const char *);
 
 void free_untracked_cache(struct untracked_cache *);
-struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz);
+struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz, struct json_writer *jw);
 void write_untracked_extension(struct strbuf *out, struct untracked_cache *untracked);
 void add_untracked_cache(struct index_state *istate);
 void remove_untracked_cache(struct index_state *istate);
diff --git a/json-writer.h b/json-writer.h
index c48c4cbf33..c3d0fbd1ef 100644
--- a/json-writer.h
+++ b/json-writer.h
@@ -121,6 +121,12 @@ static inline void jw_object_inline_begin_array_gently(struct json_writer *jw,
 		jw_object_inline_begin_array(jw, name);
 }
 
+static inline void jw_array_inline_begin_object_gently(struct json_writer *jw)
+{
+	if (jw)
+		jw_array_inline_begin_object(jw);
+}
+
 static inline void jw_end_gently(struct json_writer *jw)
 {
 	if (jw)
diff --git a/read-cache.c b/read-cache.c
index d09ce42b9a..a70df4b0a5 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1725,7 +1725,7 @@ static int read_index_extension(struct index_state *istate,
 		ret = read_link_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_UNTRACKED:
-		istate->untracked = read_untracked_extension(data, sz);
+		istate->untracked = read_untracked_extension(data, sz, istate->jw);
 		break;
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
index fc313f2c9a..082fe8e966 100755
--- a/t/t3011-ls-files-json.sh
+++ b/t/t3011-ls-files-json.sh
@@ -30,6 +30,7 @@ test_expect_success 'setup' '
 	git add sub/two &&
 
 	git commit -m first &&
+	git update-index --untracked-cache &&
 
 	echo intent-to-add >ita &&
 	git add -N ita &&
@@ -39,7 +40,7 @@ test_expect_success 'setup' '
 	strip_string oid ident
 '
 
-test_expect_success 'ls-files --json, main entries and TREE' '
+test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
 	compare_json basic
 '
 
diff --git a/t/t3011/basic b/t/t3011/basic
index e27f5be5ff..8e049f5350 100644
--- a/t/t3011/basic
+++ b/t/t3011/basic
@@ -80,6 +80,45 @@
           }
         ]
       }
+    },
+    "UNTR": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "ident": <string>,
+      "info_exclude": {
+        "valid": true,
+        "oid": <string>,
+        "stat": {
+          "ctime_sec": <number>,
+          "ctime_nsec": <number>,
+          "mtime_sec": <number>,
+          "mtime_nsec": <number>,
+          "device": <number>,
+          "inode": <number>,
+          "uid": <number>,
+          "gid": <number>,
+          "size": 0
+        }
+      },
+      "excludes_file": {
+        "valid": true,
+        "oid": <string>,
+        "stat": {
+          "ctime_sec": <number>,
+          "ctime_nsec": <number>,
+          "mtime_sec": <number>,
+          "mtime_nsec": <number>,
+          "device": <number>,
+          "inode": <number>,
+          "uid": <number>,
+          "gid": <number>,
+          "size": 0
+        }
+      },
+      "flags": 6,
+      "show_other_directories": true,
+      "hide_empty_directories": true,
+      "excludes_per_dir": ".gitignore"
     }
   }
 }
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (3 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 04/10] dir.c: dump "UNTR" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 20:06   ` Jeff Hostetler
                     ` (2 more replies)
  2019-06-24 13:02 ` [PATCH v2 06/10] fsmonitor.c: dump "FSMN" " Nguyễn Thái Ngọc Duy
                   ` (5 subsequent siblings)
  10 siblings, 3 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 json-writer.c             | 14 ++++++++++++++
 json-writer.h             |  3 +++
 split-index.c             |  9 ++++++++-
 t/t3011-ls-files-json.sh  | 14 ++++++++++++++
 t/t3011/split-index (new) | 39 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/json-writer.c b/json-writer.c
index 0608726512..c0bd302e4e 100644
--- a/json-writer.c
+++ b/json-writer.c
@@ -1,4 +1,5 @@
 #include "cache.h"
+#include "ewah/ewok.h"
 #include "json-writer.h"
 
 void jw_init(struct json_writer *jw)
@@ -224,6 +225,19 @@ void jw_object_stat_data(struct json_writer *jw, const char *name,
 	jw_end(jw);
 }
 
+static void dump_ewah_one(size_t pos, void *jw)
+{
+	jw_array_intmax(jw, pos);
+}
+
+void jw_object_ewah(struct json_writer *jw, const char *key,
+		    struct ewah_bitmap *ewah)
+{
+	jw_object_inline_begin_array(jw, key);
+	ewah_each_bit(ewah, dump_ewah_one, jw);
+	jw_end(jw);
+}
+
 static void increase_indent(struct strbuf *sb,
 			    const struct json_writer *jw,
 			    int indent)
diff --git a/json-writer.h b/json-writer.h
index c3d0fbd1ef..07d841d52a 100644
--- a/json-writer.h
+++ b/json-writer.h
@@ -45,6 +45,7 @@
 #include "git-compat-util.h"
 #include "strbuf.h"
 
+struct ewah_bitmap;
 struct stat_data;
 
 struct json_writer
@@ -87,6 +88,8 @@ void jw_object_null(struct json_writer *jw, const char *key);
 void jw_object_filemode(struct json_writer *jw, const char *key, mode_t value);
 void jw_object_stat_data(struct json_writer *jw, const char *key,
 			 const struct stat_data *sd);
+void jw_object_ewah(struct json_writer *jw, const char *key,
+		    struct ewah_bitmap *ewah);
 void jw_object_sub_jw(struct json_writer *jw, const char *key,
 		      const struct json_writer *value);
 
diff --git a/split-index.c b/split-index.c
index e6154e4ea9..41552bf771 100644
--- a/split-index.c
+++ b/split-index.c
@@ -1,4 +1,5 @@
 #include "cache.h"
+#include "json-writer.h"
 #include "split-index.h"
 #include "ewah/ewok.h"
 
@@ -25,7 +26,7 @@ int read_link_extension(struct index_state *istate,
 	data += the_hash_algo->rawsz;
 	sz -= the_hash_algo->rawsz;
 	if (!sz)
-		return 0;
+		goto done;
 	si->delete_bitmap = ewah_new();
 	ret = ewah_read_mmap(si->delete_bitmap, data, sz);
 	if (ret < 0)
@@ -38,6 +39,12 @@ int read_link_extension(struct index_state *istate,
 		return error("corrupt replace bitmap in link extension");
 	if (ret != sz)
 		return error("garbage at the end of link extension");
+done:
+	if (istate->jw) {
+		jw_object_string(istate->jw, "oid", oid_to_hex(&si->base_oid));
+		jw_object_ewah(istate->jw, "delete_bitmap", si->delete_bitmap);
+		jw_object_ewah(istate->jw, "replace_bitmap", si->replace_bitmap);
+	}
 	return 0;
 }
 
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
index 082fe8e966..dbb572ce9d 100755
--- a/t/t3011-ls-files-json.sh
+++ b/t/t3011-ls-files-json.sh
@@ -44,4 +44,18 @@ test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
 	compare_json basic
 '
 
+test_expect_success 'ls-files --json, split index' '
+	git init split &&
+	(
+		cd split &&
+		echo one >one &&
+		git add one &&
+		git update-index --split-index &&
+		echo updated >>one &&
+		test_must_fail git -c splitIndex.maxPercentChange=100 update-index --refresh &&
+		cp ../filter.sed . &&
+		compare_json split-index
+	)
+'
+
 test_done
diff --git a/t/t3011/split-index b/t/t3011/split-index
new file mode 100644
index 0000000000..cdcc4ddded
--- /dev/null
+++ b/t/t3011/split-index
@@ -0,0 +1,39 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 4
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "link": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "oid": <string>,
+      "delete_bitmap": [
+      ],
+      "replace_bitmap": [
+        0
+      ]
+    }
+  }
+}
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 06/10] fsmonitor.c: dump "FSMN" extension as json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (4 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 05/10] split-index.c: dump "link" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 07/10] resolve-undo.c: dump "REUC" " Nguyễn Thái Ngọc Duy
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 fsmonitor.c              |  6 ++++++
 t/t3011-ls-files-json.sh | 14 +++++++++++++-
 t/t3011/fsmonitor (new)  | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/fsmonitor.c b/fsmonitor.c
index 1dee0aded1..5ed55ad176 100644
--- a/fsmonitor.c
+++ b/fsmonitor.c
@@ -3,6 +3,7 @@
 #include "dir.h"
 #include "ewah/ewok.h"
 #include "fsmonitor.h"
+#include "json-writer.h"
 #include "run-command.h"
 #include "strbuf.h"
 
@@ -50,6 +51,11 @@ int read_fsmonitor_extension(struct index_state *istate, const void *data,
 	}
 	istate->fsmonitor_dirty = fsmonitor_dirty;
 
+	if (istate->jw) {
+		jw_object_intmax(istate->jw, "version", hdr_version);
+		jw_object_intmax(istate->jw, "last_update", istate->fsmonitor_last_update);
+		jw_object_ewah(istate->jw, "dirty", fsmonitor_dirty);
+	}
 	trace_printf_key(&trace_fsmonitor, "read fsmonitor extension successful");
 	return 0;
 }
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
index dbb572ce9d..25215f83ae 100755
--- a/t/t3011-ls-files-json.sh
+++ b/t/t3011-ls-files-json.sh
@@ -36,7 +36,7 @@ test_expect_success 'setup' '
 	git add -N ita &&
 
 	strip_number ctime_sec ctime_nsec mtime_sec mtime_nsec &&
-	strip_number device inode uid gid file_offset ext_size &&
+	strip_number device inode uid gid file_offset ext_size last_update &&
 	strip_string oid ident
 '
 
@@ -58,4 +58,16 @@ test_expect_success 'ls-files --json, split index' '
 	)
 '
 
+test_expect_success 'ls-files --json, fsmonitor extension ' '
+	git init fsmonitor &&
+	(
+		cd fsmonitor &&
+		echo one >one &&
+		git add one &&
+		git update-index --fsmonitor &&
+		cp ../filter.sed . &&
+		compare_json fsmonitor
+	)
+'
+
 test_done
diff --git a/t/t3011/fsmonitor b/t/t3011/fsmonitor
new file mode 100644
index 0000000000..17f2d4a664
--- /dev/null
+++ b/t/t3011/fsmonitor
@@ -0,0 +1,38 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "one",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 4
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "FSMN": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "version": 1,
+      "last_update": <number>,
+      "dirty": [
+        0
+      ]
+    }
+  }
+}
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 07/10] resolve-undo.c: dump "REUC" extension as json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (5 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 06/10] fsmonitor.c: dump "FSMN" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 08/10] read-cache.c: dump "EOIE" " Nguyễn Thái Ngọc Duy
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c             |  2 +-
 resolve-undo.c           | 30 +++++++++++++++++-
 resolve-undo.h           |  4 ++-
 t/t3011-ls-files-json.sh | 20 ++++++++++++
 t/t3011/rerere (new)     | 66 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index a70df4b0a5..e5183636fc 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1719,7 +1719,7 @@ static int read_index_extension(struct index_state *istate,
 		istate->cache_tree = cache_tree_read(data, sz, istate->jw);
 		break;
 	case CACHE_EXT_RESOLVE_UNDO:
-		istate->resolve_undo = resolve_undo_read(data, sz);
+		istate->resolve_undo = resolve_undo_read(data, sz, istate->jw);
 		break;
 	case CACHE_EXT_LINK:
 		ret = read_link_extension(istate, data, sz);
diff --git a/resolve-undo.c b/resolve-undo.c
index 236320f179..68921e3dfe 100644
--- a/resolve-undo.c
+++ b/resolve-undo.c
@@ -1,5 +1,6 @@
 #include "cache.h"
 #include "dir.h"
+#include "json-writer.h"
 #include "resolve-undo.h"
 #include "string-list.h"
 
@@ -49,7 +50,30 @@ void resolve_undo_write(struct strbuf *sb, struct string_list *resolve_undo)
 	}
 }
 
-struct string_list *resolve_undo_read(const char *data, unsigned long size)
+static void dump_resolve_undo(struct json_writer *jw,
+			      const char *path,
+			      const struct resolve_undo_info *ui)
+{
+	int i;
+
+	if (!jw)
+		return;
+
+	jw_array_inline_begin_object(jw);
+	jw_object_string(jw, "path", path);
+
+	jw_object_inline_begin_array(jw, "stages");
+	for (i = 0; i < 3; i++) {
+		jw_array_inline_begin_object(jw);
+		jw_object_filemode(jw, "mode", ui->mode[i]);
+		jw_object_string(jw, "oid", oid_to_hex(&ui->oid[i]));
+		jw_end(jw);
+	}
+	jw_end(jw);
+}
+
+struct string_list *resolve_undo_read(const char *data, unsigned long size,
+				      struct json_writer *jw)
 {
 	struct string_list *resolve_undo;
 	size_t len;
@@ -59,6 +83,7 @@ struct string_list *resolve_undo_read(const char *data, unsigned long size)
 
 	resolve_undo = xcalloc(1, sizeof(*resolve_undo));
 	resolve_undo->strdup_strings = 1;
+	jw_object_inline_begin_array_gently(jw, "entries");
 
 	while (size) {
 		struct string_list_item *lost;
@@ -94,7 +119,10 @@ struct string_list *resolve_undo_read(const char *data, unsigned long size)
 			size -= rawsz;
 			data += rawsz;
 		}
+
+		dump_resolve_undo(jw, lost->string, ui);
 	}
+	jw_end_gently(jw);
 	return resolve_undo;
 
 error:
diff --git a/resolve-undo.h b/resolve-undo.h
index 2b3f0f901e..46b4e93a7e 100644
--- a/resolve-undo.h
+++ b/resolve-undo.h
@@ -3,6 +3,8 @@
 
 #include "cache.h"
 
+struct json_writer;
+
 struct resolve_undo_info {
 	unsigned int mode[3];
 	struct object_id oid[3];
@@ -10,7 +12,7 @@ struct resolve_undo_info {
 
 void record_resolve_undo(struct index_state *, struct cache_entry *);
 void resolve_undo_write(struct strbuf *, struct string_list *);
-struct string_list *resolve_undo_read(const char *, unsigned long);
+struct string_list *resolve_undo_read(const char *, unsigned long, struct json_writer *);
 void resolve_undo_clear_index(struct index_state *);
 int unmerge_index_entry_at(struct index_state *, int);
 void unmerge_index(struct index_state *, const struct pathspec *);
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
index 25215f83ae..dc57138f5b 100755
--- a/t/t3011-ls-files-json.sh
+++ b/t/t3011-ls-files-json.sh
@@ -70,4 +70,24 @@ test_expect_success 'ls-files --json, fsmonitor extension ' '
 	)
 '
 
+test_expect_success 'ls-files --json, rerere extension' '
+	git init rerere &&
+	(
+		cd rerere &&
+		mkdir fi &&
+		test_commit initial fi/le first &&
+		git branch side &&
+		test_commit second fi/le second &&
+		git checkout side &&
+		test_commit third fi/le third &&
+		git checkout master &&
+		git config rerere.enabled true &&
+		test_must_fail git merge side &&
+		echo resolved >fi/le &&
+		git add fi/le &&
+		cp ../filter.sed . &&
+		compare_json rerere
+	)
+'
+
 test_done
diff --git a/t/t3011/rerere b/t/t3011/rerere
new file mode 100644
index 0000000000..a8ec4b16ee
--- /dev/null
+++ b/t/t3011/rerere
@@ -0,0 +1,66 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "fi/le",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 9
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "TREE": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "root": {
+        "oid": null,
+        "subdirs": [
+          {
+            "name": "fi",
+            "oid": null,
+            "subdirs": [
+            ]
+          }
+        ]
+      }
+    },
+    "REUC": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "entries": [
+        {
+          "path": "fi/le",
+          "stages": [
+            {
+              "mode": "100644",
+              "oid": <string>
+            },
+            {
+              "mode": "100644",
+              "oid": <string>
+            },
+            {
+              "mode": "100644",
+              "oid": <string>
+            }
+          ]
+        }
+      ]
+    }
+  }
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 08/10] read-cache.c: dump "EOIE" extension as json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (6 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 07/10] resolve-undo.c: dump "REUC" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 09/10] read-cache.c: dump "IEOT" " Nguyễn Thái Ngọc Duy
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c             |  8 ++++
 t/t3011-ls-files-json.sh | 13 ++++++
 t/t3011/eoie (new)       | 96 ++++++++++++++++++++++++++++++++++++++++
 t/test-lib.sh            |  4 ++
 4 files changed, 121 insertions(+)

diff --git a/read-cache.c b/read-cache.c
index e5183636fc..37491dd03d 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1731,6 +1731,14 @@ static int read_index_extension(struct index_state *istate,
 		read_fsmonitor_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_ENDOFINDEXENTRIES:
+		if (istate->jw) {
+			/* must be synchronized with read_eoie_extension() */
+			jw_object_intmax(istate->jw, "offset", get_be32(data));
+			jw_object_string(istate->jw, "oid",
+					 hash_to_hex((const unsigned char*)data + sizeof(uint32_t)));
+		}
+		/* already handled in do_read_index() */
+		break;
 	case CACHE_EXT_INDEXENTRYOFFSETTABLE:
 		/* already handled in do_read_index() */
 		break;
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
index dc57138f5b..9f4ad4c9cf 100755
--- a/t/t3011-ls-files-json.sh
+++ b/t/t3011-ls-files-json.sh
@@ -90,4 +90,17 @@ test_expect_success 'ls-files --json, rerere extension' '
 	)
 '
 
+test_expect_success !SINGLE_CPU 'ls-files --json and multicore extensions' '
+	git init eoie &&
+	(
+		cd eoie &&
+		git config index.threads 2 &&
+		touch one two three four &&
+		git add . &&
+		cp ../filter.sed . &&
+		strip_number offset &&
+		compare_json eoie
+	)
+'
+
 test_done
diff --git a/t/t3011/eoie b/t/t3011/eoie
new file mode 100644
index 0000000000..85ec61517b
--- /dev/null
+++ b/t/t3011/eoie
@@ -0,0 +1,96 @@
+{
+  "version": 2,
+  "oid": <string>,
+  "mtime_sec": <number>,
+  "mtime_nsec": <number>,
+  "entries": [
+    {
+      "id": 0,
+      "name": "four",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 1,
+      "name": "one",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 2,
+      "name": "three",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    },
+    {
+      "id": 3,
+      "name": "two",
+      "mode": "100644",
+      "flags": 0,
+      "oid": <string>,
+      "stat": {
+        "ctime_sec": <number>,
+        "ctime_nsec": <number>,
+        "mtime_sec": <number>,
+        "mtime_nsec": <number>,
+        "device": <number>,
+        "inode": <number>,
+        "uid": <number>,
+        "gid": <number>,
+        "size": 0
+      },
+      "file_offset": <number>
+    }
+  ],
+  "extensions": {
+    "IEOT": {
+      "file_offset": <number>,
+      "ext_size": <number>
+    },
+    "EOIE": {
+      "file_offset": <number>,
+      "ext_size": <number>,
+      "offset": <number>,
+      "oid": <string>
+    }
+  }
+}
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 4b346467df..9d5b273b40 100644
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -1611,3 +1611,7 @@ test_lazy_prereq REBASE_P '
 test_lazy_prereq FAIL_PREREQS '
 	test -n "$GIT_TEST_FAIL_PREREQS"
 '
+
+test_lazy_prereq SINGLE_CPU '
+	test "$(test-tool online-cpus)" -eq 1
+'
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 09/10] read-cache.c: dump "IEOT" extension as json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (7 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 08/10] read-cache.c: dump "EOIE" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 13:02 ` [PATCH v2 10/10] t3008: use the new SINGLE_CPU prereq Nguyễn Thái Ngọc Duy
  2019-06-24 18:00 ` [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Johannes Schindelin
  10 siblings, 0 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 43 ++++++++++++++++++++++++++++++++++++-------
 t/t3011/eoie | 13 ++++++++++++-
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 37491dd03d..c26edcc9d9 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1693,6 +1693,7 @@ static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 	return 0;
 }
 
+static struct index_entry_offset_table *do_read_ieot_extension(struct index_state *, const char *, uint32_t);
 static int read_index_extension(struct index_state *istate,
 				const char *map,
 				unsigned long *offset)
@@ -1740,7 +1741,11 @@ static int read_index_extension(struct index_state *istate,
 		/* already handled in do_read_index() */
 		break;
 	case CACHE_EXT_INDEXENTRYOFFSETTABLE:
-		/* already handled in do_read_index() */
+		if (istate->jw) {
+			free(do_read_ieot_extension(istate, data, sz));
+		} else {
+			/* already handled in do_read_index() */
+		}
 		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
@@ -1938,7 +1943,7 @@ struct index_entry_offset_table
 	struct index_entry_offset entries[FLEX_ARRAY];
 };
 
-static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset);
+static struct index_entry_offset_table *read_ieot_extension(struct index_state *istate, const char *mmap, size_t mmap_size, size_t offset);
 static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot);
 
 static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
@@ -2292,7 +2297,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	 * to multi-thread the reading of the cache entries.
 	 */
 	if (extension_offset && nr_threads > 1)
-		ieot = read_ieot_extension(mmap, mmap_size, extension_offset);
+		ieot = read_ieot_extension(istate, mmap, mmap_size, extension_offset);
 
 	if (ieot) {
 		src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, nr_threads, ieot);
@@ -3634,12 +3639,13 @@ static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context,
 
 #define IEOT_VERSION	(1)
 
-static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset)
+static struct index_entry_offset_table *read_ieot_extension(
+		struct index_state *istate,
+		const char *mmap, size_t mmap_size,
+		size_t offset)
 {
 	const char *index = NULL;
-	uint32_t extsize, ext_version;
-	struct index_entry_offset_table *ieot;
-	int i, nr;
+	uint32_t extsize;
 
 	/* find the IEOT extension */
 	if (!offset)
@@ -3655,6 +3661,17 @@ static struct index_entry_offset_table *read_ieot_extension(const char *mmap, si
 	}
 	if (!index)
 		return NULL;
+	return do_read_ieot_extension(istate, index, extsize);
+}
+
+static struct index_entry_offset_table *do_read_ieot_extension(
+		struct index_state *istate,
+		const char *index,
+		uint32_t extsize)
+{
+	uint32_t ext_version;
+	struct index_entry_offset_table *ieot;
+	int i, nr;
 
 	/* validate the version is IEOT_VERSION */
 	ext_version = get_be32(index);
@@ -3670,6 +3687,10 @@ static struct index_entry_offset_table *read_ieot_extension(const char *mmap, si
 		error("invalid number of IEOT entries %d", nr);
 		return NULL;
 	}
+	if (istate->jw) {
+		jw_object_intmax(istate->jw, "version", ext_version);
+		jw_object_inline_begin_array(istate->jw, "entries");
+	}
 	ieot = xmalloc(sizeof(struct index_entry_offset_table)
 		       + (nr * sizeof(struct index_entry_offset)));
 	ieot->nr = nr;
@@ -3678,7 +3699,15 @@ static struct index_entry_offset_table *read_ieot_extension(const char *mmap, si
 		index += sizeof(uint32_t);
 		ieot->entries[i].nr = get_be32(index);
 		index += sizeof(uint32_t);
+
+		if (istate->jw) {
+			jw_array_inline_begin_object(istate->jw);
+			jw_object_intmax(istate->jw, "offset", ieot->entries[i].offset);
+			jw_object_intmax(istate->jw, "count", ieot->entries[i].nr);
+			jw_end(istate->jw);
+		}
 	}
+	jw_end_gently(istate->jw);
 
 	return ieot;
 }
diff --git a/t/t3011/eoie b/t/t3011/eoie
index 85ec61517b..66a0feb3b6 100644
--- a/t/t3011/eoie
+++ b/t/t3011/eoie
@@ -84,7 +84,18 @@
   "extensions": {
     "IEOT": {
       "file_offset": <number>,
-      "ext_size": <number>
+      "ext_size": <number>,
+      "version": 1,
+      "entries": [
+        {
+          "offset": <number>,
+          "count": 2
+        },
+        {
+          "offset": <number>,
+          "count": 2
+        }
+      ]
     },
     "EOIE": {
       "file_offset": <number>,
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v2 10/10] t3008: use the new SINGLE_CPU prereq
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (8 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 09/10] read-cache.c: dump "IEOT" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 13:02 ` Nguyễn Thái Ngọc Duy
  2019-06-24 18:00 ` [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Johannes Schindelin
  10 siblings, 0 replies; 43+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2019-06-24 13:02 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 t/t3008-ls-files-lazy-init-name-hash.sh | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/t/t3008-ls-files-lazy-init-name-hash.sh b/t/t3008-ls-files-lazy-init-name-hash.sh
index 64f047332b..7f918c05f6 100755
--- a/t/t3008-ls-files-lazy-init-name-hash.sh
+++ b/t/t3008-ls-files-lazy-init-name-hash.sh
@@ -4,15 +4,9 @@ test_description='Test the lazy init name hash with various folder structures'
 
 . ./test-lib.sh
 
-if test 1 -eq $($GIT_BUILD_DIR/t/helper/test-tool online-cpus)
-then
-	skip_all='skipping lazy-init tests, single cpu'
-	test_done
-fi
-
 LAZY_THREAD_COST=2000
 
-test_expect_success 'no buffer overflow in lazy_init_name_hash' '
+test_expect_success !SINGLE_CPU 'no buffer overflow in lazy_init_name_hash' '
 	(
 	    test_seq $LAZY_THREAD_COST | sed "s/^/a_/" &&
 	    echo b/b/b &&
-- 
2.22.0.rc0.322.g2b0371e29a


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
                   ` (9 preceding siblings ...)
  2019-06-24 13:02 ` [PATCH v2 10/10] t3008: use the new SINGLE_CPU prereq Nguyễn Thái Ngọc Duy
@ 2019-06-24 18:00 ` Johannes Schindelin
  2019-06-24 18:39   ` Jeff Hostetler
  2019-06-25  9:05   ` Duy Nguyen
  10 siblings, 2 replies; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-24 18:00 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

[-- Attachment #1: Type: text/plain, Size: 1267 bytes --]

Hi Duy,

On Mon, 24 Jun 2019, Nguyễn Thái Ngọc Duy wrote:

> - json field names now use '_' instead of '.' to be friendlier to some
>   languages. I stick to underscore_name instead of camelCase because
>   the former is closer to what we use

This is not a good reason. People who are used to reading JSON will stumble
over this all the time because it is so uncommon.

> - extension location is printed, in case you need to decode the
>   extension by yourself (previously only the size is printed)
> - all extensions are printed in the same order they appear in the file
>   (previously eoie and ieot are printed first because that's how we
>   parse)
> - resolve undo extension is reorganized a bit to be easier to read
> - tests added. Example json files are in t/t3011

It might actually make sense to optionally disable showing extensions.

You also forgot to mention that you explicitly disable handling
`<pathspec>`, which I find a bit odd, personally, as that would probably
come in real handy at times, especially when we offer this as a better way
for 3rd-party applications to interact with Git (which I think will be the
use case for this feature that will be _far_ more common than using it for
debugging).

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-24 18:00 ` [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Johannes Schindelin
@ 2019-06-24 18:39   ` Jeff Hostetler
  2019-06-25  9:05   ` Duy Nguyen
  1 sibling, 0 replies; 43+ messages in thread
From: Jeff Hostetler @ 2019-06-24 18:39 UTC (permalink / raw)
  To: Johannes Schindelin, Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason



On 6/24/2019 2:00 PM, Johannes Schindelin wrote:
> Hi Duy,
> 
> On Mon, 24 Jun 2019, Nguyễn Thái Ngọc Duy wrote:
> 
>> - json field names now use '_' instead of '.' to be friendlier to some
>>    languages. I stick to underscore_name instead of camelCase because
>>    the former is closer to what we use
> 
> This is not a good reason. People who are used to read JSON will stumble
> over this all the time because it is so uncommon.

Getting rid of "." and "-" in field names is the important part.
These confuse some languages and make us drop into object["<field>"]
syntax in my experience.

As for "_" or camelCase (or PascalCase), I'm not sure it matters one
way or the other.  Personally, I'd vote for underscores.

Jeff

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
@ 2019-06-24 19:15   ` Jeff Hostetler
  2019-06-24 20:04     ` Junio C Hamano
                       ` (2 more replies)
  2019-06-25  9:05   ` Thomas Gummerer
                     ` (2 subsequent siblings)
  3 siblings, 3 replies; 43+ messages in thread
From: Jeff Hostetler @ 2019-06-24 19:15 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy, git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin



On 6/24/2019 9:02 AM, Nguyễn Thái Ngọc Duy wrote:
> So far we don't have a command to basically dump the index file out,
> with all its gory details. Checking some info, for example, stat
> time, usually involves either writing new code or firing up "xxd" and
> decoding values by yourself.
> 
> This --json is supposed to help that. It dumps the index in a human
> readable format but also easy to be processed with tools. And it will
> print almost enough info to reconstruct the index later.
> 
> In this patch we only dump the main part, not extensions. But at the
> end of the series, the entire index is dumped. The end result could be
> very verbose even on a small repository such as git.git.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---

...

> diff --git a/json-writer.c b/json-writer.c
> index aadb9dbddc..0608726512 100644
> --- a/json-writer.c
> +++ b/json-writer.c
> @@ -202,6 +202,28 @@ void jw_object_null(struct json_writer *jw, const char *key)
>   	strbuf_addstr(&jw->json, "null");
>   }
>   
> +void jw_object_filemode(struct json_writer *jw, const char *key, mode_t mode)
> +{
> +	object_common(jw, key);
> +	strbuf_addf(&jw->json, "\"%06o\"", mode);
> +}
> +
> +void jw_object_stat_data(struct json_writer *jw, const char *name,
> +			 const struct stat_data *sd)

Should this be in json-writer.c or in read-cache.c?
Currently, json-writer.c is concerned with formatting
JSON on basic/scalar types.  Do we want to start
extending it to handle arbitrary structures?  Or would
it be better for the code that defines/manipulates the
structure to define a "stat_data_dump_json()" function.

I'm torn on the jw_object_filemode() function, JSON format
limits us to decimal integers and there are places where
I'd like to have hex, or in this case octal values.

I'm thinking it'd be better to have a helper function in
read-cache.c that formats a local strbuf and calls
jw_object_string(&jw, key, buf);


> +{
> +	jw_object_inline_begin_object(jw, name);
> +	jw_object_intmax(jw, "ctime_sec", sd->sd_ctime.sec);
> +	jw_object_intmax(jw, "ctime_nsec", sd->sd_ctime.nsec);
> +	jw_object_intmax(jw, "mtime_sec", sd->sd_mtime.sec);
> +	jw_object_intmax(jw, "mtime_nsec", sd->sd_mtime.nsec);

It'd be nice if we could also have a formatted date
for the mtime and ctime in addition to the integer
values.  (I'm not sure whether you'd always want them
or make it a verbose option.)

> +	jw_object_intmax(jw, "device", sd->sd_dev);
> +	jw_object_intmax(jw, "inode", sd->sd_ino);
> +	jw_object_intmax(jw, "uid", sd->sd_uid);
> +	jw_object_intmax(jw, "gid", sd->sd_gid);
> +	jw_object_intmax(jw, "size", sd->sd_size);
> +	jw_end(jw);
> +}
> +
>   static void increase_indent(struct strbuf *sb,
>   			    const struct json_writer *jw,
>   			    int indent)
> diff --git a/json-writer.h b/json-writer.h
> index 83906b09c1..c48c4cbf33 100644
> --- a/json-writer.h
> +++ b/json-writer.h
> @@ -42,8 +42,11 @@
>    * of the given strings.
>    */
>   
> +#include "git-compat-util.h"
>   #include "strbuf.h"
>   
> +struct stat_data;
> +
>   struct json_writer
>   {
>   	/*
> @@ -81,6 +84,9 @@ void jw_object_true(struct json_writer *jw, const char *key);
>   void jw_object_false(struct json_writer *jw, const char *key);
>   void jw_object_bool(struct json_writer *jw, const char *key, int value);
>   void jw_object_null(struct json_writer *jw, const char *key);
> +void jw_object_filemode(struct json_writer *jw, const char *key, mode_t value);
> +void jw_object_stat_data(struct json_writer *jw, const char *key,
> +			 const struct stat_data *sd);
>   void jw_object_sub_jw(struct json_writer *jw, const char *key,
>   		      const struct json_writer *value);
>   
> @@ -104,4 +110,21 @@ void jw_array_inline_begin_array(struct json_writer *jw);
>   int jw_is_terminated(const struct json_writer *jw);
>   void jw_end(struct json_writer *jw);
>   
> +/*
> + * These _gently versions accept NULL json_writer to reduce too much
> + * branching at the call site.
> + */
> +static inline void jw_object_inline_begin_array_gently(struct json_writer *jw,
> +						       const char *name)
> +{
> +	if (jw)
> +		jw_object_inline_begin_array(jw, name);
> +}
> +
> +static inline void jw_end_gently(struct json_writer *jw)
> +{
> +	if (jw)
> +		jw_end(jw);
> +}
> +
>   #endif /* JSON_WRITER_H */
> diff --git a/read-cache.c b/read-cache.c
> index 4dd22f4f6e..db5147d088 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -25,6 +25,7 @@
>   #include "fsmonitor.h"
>   #include "thread-utils.h"
>   #include "progress.h"
> +#include "json-writer.h"
>   
>   /* Mask for the name length in ce_flags in the on-disk index */
>   
> @@ -1952,6 +1953,49 @@ static void *load_index_extensions(void *_data)
>   	return NULL;
>   }
>   
> +static void dump_cache_entry(struct index_state *istate,
> +			     int index,
> +			     unsigned long offset,
> +			     const struct cache_entry *ce)
> +{
> +	struct json_writer *jw = istate->jw;
> +
> +	jw_array_inline_begin_object(jw);
> +
> +	/*
> +	 * this is technically redundant, but it's for easier
> +	 * navigation when there hundreds of entries
> +	 */
> +	jw_object_intmax(jw, "id", index);
> +
> +	jw_object_string(jw, "name", ce->name);
> +
> +	jw_object_filemode(jw, "mode", ce->ce_mode);
> +
> +	jw_object_intmax(jw, "flags", ce->ce_flags);

It would be nice to have the flags as a hex-formatted string
in addition to (or instead of) the decimal integer value.

> +	/*
> +	 * again redundant info, just so you don't have to decode
> +	 * flags values manually
> +	 */
> +	if (ce->ce_flags & CE_EXTENDED)
> +		jw_object_true(jw, "extended_flags");
> +	if (ce->ce_flags & CE_VALID)
> +		jw_object_true(jw, "assume_unchanged");
> +	if (ce->ce_flags & CE_INTENT_TO_ADD)
> +		jw_object_true(jw, "intent_to_add");
> +	if (ce->ce_flags & CE_SKIP_WORKTREE)
> +		jw_object_true(jw, "skip_worktree");
> +	if (ce_stage(ce))
> +		jw_object_intmax(jw, "stage", ce_stage(ce));
> +
> +	jw_object_string(jw, "oid", oid_to_hex(&ce->oid));
> +
> +	jw_object_stat_data(jw, "stat", &ce->ce_stat_data);
> +	jw_object_intmax(jw, "file_offset", offset);
> +
> +	jw_end(jw);
> +}
> +
>   /*
>    * A helper function that will load the specified range of cache entries
>    * from the memory mapped file and add them to the given index.
> @@ -1972,6 +2016,9 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
>   		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
>   		set_index_entry(istate, i, ce);
>   
> +		if (istate->jw)
> +			dump_cache_entry(istate, i, src_offset, ce);
> +
>   		src_offset += consumed;
>   		previous_ce = ce;
>   	}
> @@ -1983,6 +2030,8 @@ static unsigned long load_all_cache_entries(struct index_state *istate,
>   {
>   	unsigned long consumed;
>   
> +	jw_object_inline_begin_array_gently(istate->jw, "entries");
> +
>   	if (istate->version == 4) {
>   		mem_pool_init(&istate->ce_mem_pool,
>   				estimate_cache_size_from_compressed(istate->cache_nr));
> @@ -1993,6 +2042,8 @@ static unsigned long load_all_cache_entries(struct index_state *istate,
>   
>   	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
>   					0, istate->cache_nr, mmap, src_offset, NULL);
> +
> +	jw_end_gently(istate->jw);
>   	return consumed;
>   }
>   
> @@ -2120,6 +2171,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>   	size_t extension_offset = 0;
>   	int nr_threads, cpus;
>   	struct index_entry_offset_table *ieot = NULL;
> +	int jw_pretty = 1;
>   
>   	if (istate->initialized)
>   		return istate->cache_nr;
> @@ -2154,6 +2206,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>   	istate->cache_nr = ntohl(hdr->hdr_entries);
>   	istate->cache_alloc = alloc_nr(istate->cache_nr);
>   	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
> +	istate->timestamp.sec = st.st_mtime;
> +	istate->timestamp.nsec = ST_MTIME_NSEC(st);
>   	istate->initialized = 1;
>   
>   	p.istate = istate;
> @@ -2176,6 +2230,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>   	if (!HAVE_THREADS)
>   		nr_threads = 1;
>   
> +	if (istate->jw) {
> +		jw_object_begin(istate->jw, jw_pretty);
> +		jw_object_intmax(istate->jw, "version", istate->version);
> +		jw_object_string(istate->jw, "oid", oid_to_hex(&istate->oid));
> +		jw_object_intmax(istate->jw, "mtime_sec", istate->timestamp.sec);
> +		jw_object_intmax(istate->jw, "mtime_nsec", istate->timestamp.nsec);

again, it would be nice to also have a formatted version of the mtime.

> +
> +		/*
> +		 * Threading may mess up json writing. This is for
> +		 * debugging only, so performance is not a concern.
> +		 */
> +		nr_threads = 1;

yes. we should turn off threading when dumping to json.

> +	}
> +
>   	if (nr_threads > 1) {
>   		extension_offset = read_eoie_extension(mmap, mmap_size);
>   		if (extension_offset) {
> @@ -2204,8 +2272,6 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>   		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
>   	}
>   
> -	istate->timestamp.sec = st.st_mtime;
> -	istate->timestamp.nsec = ST_MTIME_NSEC(st);
>   
>   	/* if we created a thread, join it otherwise load the extensions on the primary thread */
>   	if (extension_offset) {
> @@ -2216,6 +2282,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>   		p.src_offset = src_offset;
>   		load_index_extensions(&p);
>   	}
> +	jw_end_gently(istate->jw);
> +
>   	munmap((void *)mmap, mmap_size);
>   
>   	/*
> 
...

Thanks
Jeff

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 04/10] dir.c: dump "UNTR" extension as json
  2019-06-24 13:02 ` [PATCH v2 04/10] dir.c: dump "UNTR" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 19:32   ` Jeff Hostetler
  0 siblings, 0 replies; 43+ messages in thread
From: Jeff Hostetler @ 2019-06-24 19:32 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy, git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin



On 6/24/2019 9:02 AM, Nguyễn Thái Ngọc Duy wrote:
> The big part of UNTR extension is dumped at the end instead of dumping
> as soon as we read it, because we actually "patch" some fields in
> untracked_cache_dir with EWAH bitmaps at the end.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>   dir.c                    | 57 +++++++++++++++++++++++++++++++++++++++-
>   dir.h                    |  4 ++-
>   json-writer.h            |  6 +++++
>   read-cache.c             |  2 +-
>   t/t3011-ls-files-json.sh |  3 ++-
>   t/t3011/basic            | 39 +++++++++++++++++++++++++++
>   6 files changed, 107 insertions(+), 4 deletions(-)
> 
> diff --git a/dir.c b/dir.c
> index ba4a51c296..8808577ea3 100644
> --- a/dir.c
> +++ b/dir.c
> @@ -19,6 +19,7 @@
>   #include "varint.h"
>   #include "ewah/ewok.h"
>   #include "fsmonitor.h"
> +#include "json-writer.h"
>   #include "submodule-config.h"
>   
>   /*
> @@ -2826,7 +2827,42 @@ static void load_oid_stat(struct oid_stat *oid_stat, const unsigned char *data,
>   	oid_stat->valid = 1;
>   }
>   
> -struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz)
> +static void jw_object_oid_stat(struct json_writer *jw, const char *key,
> +			       const struct oid_stat *oid_stat)

I was hoping to reserve the jw_, jw_object_, and jw_array_ prefixes
to refer to public functions in the json-writer.[ch] files, rather
than private functions elsewhere in the code.  I think it is less
confusing that way.  IMHO.

It's fine to have such helper functions for local structure types,
but I think it'd be better for them to not be named like other
public functions.

> +{
> +	jw_object_inline_begin_object(jw, key);
> +	jw_object_bool(jw, "valid", oid_stat->valid);
> +	jw_object_string(jw, "oid", oid_to_hex(&oid_stat->oid));
> +	jw_object_stat_data(jw, "stat", &oid_stat->stat);
> +	jw_end(jw);
> +}
> +
> +static void jw_object_untracked_cache_dir(struct json_writer *jw,
> +					  const struct untracked_cache_dir *ucd)
> +{
> +	int i;
> +
> +	jw_object_bool(jw, "valid", ucd->valid);
> +	jw_object_bool(jw, "check-only", ucd->check_only);
> +	jw_object_stat_data(jw, "stat", &ucd->stat_data);
> +	jw_object_string(jw, "exclude-oid", oid_to_hex(&ucd->exclude_oid));
> +	jw_object_inline_begin_array(jw, "untracked");
> +	for (i = 0; i < ucd->untracked_nr; i++)
> +		jw_array_string(jw, ucd->untracked[i]);
> +	jw_end(jw);
> +
> +	jw_object_inline_begin_object(jw, "dirs");
> +	for (i = 0; i < ucd->dirs_nr; i++) {
> +		jw_object_inline_begin_object(jw, ucd->dirs[i]->name);
> +		jw_object_untracked_cache_dir(jw, ucd->dirs[i]);
> +		jw_end(jw);
> +	}
> +	jw_end(jw);
> +}
> +
> +struct untracked_cache *read_untracked_extension(const void *data,
> +						 unsigned long sz,
> +						 struct json_writer *jw)
>   {
>   	struct untracked_cache *uc;
>   	struct read_data rd;
> @@ -2864,6 +2900,19 @@ struct untracked_cache *read_untracked_extension(const void *data, unsigned long
>   	uc->dir_flags = get_be32(next + ouc_offset(dir_flags));
>   	exclude_per_dir = (const char *)next + exclude_per_dir_offset;
>   	uc->exclude_per_dir = xstrdup(exclude_per_dir);
> +
> +	if (jw) {
> +		jw_object_string(jw, "ident", ident);
> +		jw_object_oid_stat(jw, "info_exclude", &uc->ss_info_exclude);
> +		jw_object_oid_stat(jw, "excludes_file", &uc->ss_excludes_file);
> +		jw_object_intmax(jw, "flags", uc->dir_flags);
> +		if (uc->dir_flags & DIR_SHOW_OTHER_DIRECTORIES)
> +			jw_object_bool(jw, "show_other_directories", 1);
> +		if (uc->dir_flags & DIR_HIDE_EMPTY_DIRECTORIES)
> +			jw_object_bool(jw, "hide_empty_directories", 1);
> +		jw_object_string(jw, "excludes_per_dir", uc->exclude_per_dir);
> +	}
> +
>   	/* NUL after exclude_per_dir is covered by sizeof(*ouc) */
>   	next += exclude_per_dir_offset + strlen(exclude_per_dir) + 1;
>   	if (next >= end)
> @@ -2905,6 +2954,12 @@ struct untracked_cache *read_untracked_extension(const void *data, unsigned long
>   	ewah_each_bit(rd.sha1_valid, read_oid, &rd);
>   	next = rd.data;
>   
> +	if (jw) {
> +		jw_object_inline_begin_object(jw, "root");
> +		jw_object_untracked_cache_dir(jw, uc->root);
> +		jw_end(jw);
> +	}
> +
>   done:
>   	free(rd.ucd);
>   	ewah_free(rd.valid);
> diff --git a/dir.h b/dir.h
> index 680079bbe3..80efdd05c4 100644
> --- a/dir.h
> +++ b/dir.h
> @@ -6,6 +6,8 @@
>   #include "cache.h"
>   #include "strbuf.h"
>   
> +struct json_writer;
> +
>   struct dir_entry {
>   	unsigned int len;
>   	char name[FLEX_ARRAY]; /* more */
> @@ -362,7 +364,7 @@ void untracked_cache_remove_from_index(struct index_state *, const char *);
>   void untracked_cache_add_to_index(struct index_state *, const char *);
>   
>   void free_untracked_cache(struct untracked_cache *);
> -struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz);
> +struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz, struct json_writer *jw);
>   void write_untracked_extension(struct strbuf *out, struct untracked_cache *untracked);
>   void add_untracked_cache(struct index_state *istate);
>   void remove_untracked_cache(struct index_state *istate);
> diff --git a/json-writer.h b/json-writer.h
> index c48c4cbf33..c3d0fbd1ef 100644
> --- a/json-writer.h
> +++ b/json-writer.h
> @@ -121,6 +121,12 @@ static inline void jw_object_inline_begin_array_gently(struct json_writer *jw,
>   		jw_object_inline_begin_array(jw, name);
>   }
>   
> +static inline void jw_array_inline_begin_object_gently(struct json_writer *jw)
> +{
> +	if (jw)
> +		jw_array_inline_begin_object(jw);
> +}
> +
>   static inline void jw_end_gently(struct json_writer *jw)
>   {
>   	if (jw)

I'm not sure about the need for these _gently versions, but
maybe make them macros in json-writer.h

Jeff

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 19:15   ` Jeff Hostetler
@ 2019-06-24 20:04     ` Junio C Hamano
  2019-06-25  9:21     ` Johannes Schindelin
  2019-06-25  9:52     ` Duy Nguyen
  2 siblings, 0 replies; 43+ messages in thread
From: Junio C Hamano @ 2019-06-24 20:04 UTC (permalink / raw)
  To: Jeff Hostetler
  Cc: Nguyễn Thái Ngọc Duy, git, Jeff King,
	Derrick Stolee, Ævar Arnfjörð Bjarmason,
	Johannes Schindelin

Jeff Hostetler <git@jeffhostetler.com> writes:

> On 6/24/2019 9:02 AM, Nguyễn Thái Ngọc Duy wrote:
> ...
>> +void jw_object_stat_data(struct json_writer *jw, const char *name,
>> +			 const struct stat_data *sd)
>
> Should this be in json_writer.c or in read-cache.c ?
> Currently, json_writer.c is concerned with formatting
> JSON on basic/scalar types.

That's an interesting question.

In the longer run, is it reasonable to assume that the current
callers of these functions will remain the only places that
would want to say "For this path, various attributes like timestamps
and other stuff are these values"?  The answer is probably no.

Would it be reasonable to assume that the future callers are all
likely to be closely related to what functions that are currently in
read-cache.c do and would appear only in that file?  I think the
answer would also be no.  Functions in entry.c may want to report
"I've created/updated these filesystem entities and their stat info
now look like so", for example.  So (assuming that we would want to
represent "struct stat" the same way no matter who wants to write it
out), I think it is reasonable to have this function on the json
side, not on Git side.

I do not think it is a bad idea to have a layer that sits above
json-writer.c and below read-cache.c that give us standardized
mapping from in-core objects (like "struct stat") to objects in json
(they are set of key-value pairs after all).  Call it json-schema.c
or something?

What other "higher than scalar" types do we foresee that we'd need
to serialize in a standardised way?  If we do not have that many
yet, perhaps a split like that might be a bit premature and we can
start by having the function in json-writer.c side, not in the API
consumer side.

Thanks.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-24 13:02 ` [PATCH v2 05/10] split-index.c: dump "link" " Nguyễn Thái Ngọc Duy
@ 2019-06-24 20:06   ` Jeff Hostetler
  2019-06-25 10:29     ` Duy Nguyen
  2019-07-03  9:08   ` SZEDER Gábor
  2019-07-04 20:01   ` SZEDER Gábor
  2 siblings, 1 reply; 43+ messages in thread
From: Jeff Hostetler @ 2019-06-24 20:06 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy, git
  Cc: Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin



On 6/24/2019 9:02 AM, Nguyễn Thái Ngọc Duy wrote:
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>   json-writer.c             | 14 ++++++++++++++
>   json-writer.h             |  3 +++
>   split-index.c             |  9 ++++++++-
>   t/t3011-ls-files-json.sh  | 14 ++++++++++++++
>   t/t3011/split-index (new) | 39 +++++++++++++++++++++++++++++++++++++++
>   5 files changed, 78 insertions(+), 1 deletion(-)
> 
> diff --git a/json-writer.c b/json-writer.c
> index 0608726512..c0bd302e4e 100644
> --- a/json-writer.c
> +++ b/json-writer.c
> @@ -1,4 +1,5 @@
>   #include "cache.h"
> +#include "ewah/ewok.h"
>   #include "json-writer.h"
>   
>   void jw_init(struct json_writer *jw)
> @@ -224,6 +225,19 @@ void jw_object_stat_data(struct json_writer *jw, const char *name,
>   	jw_end(jw);
>   }
>   
> +static void dump_ewah_one(size_t pos, void *jw)
> +{
> +	jw_array_intmax(jw, pos);
> +}
> +
> +void jw_object_ewah(struct json_writer *jw, const char *key,
> +		    struct ewah_bitmap *ewah)
> +{
> +	jw_object_inline_begin_array(jw, key);
> +	ewah_each_bit(ewah, dump_ewah_one, jw);
> +	jw_end(jw);
> +}
> +

As I said in an earlier commit in this series, I'd prefer
that we keep such data-structure-specific helper functions
in the source file that defines the data structure rather
than adding them to json-writer.[ch]

I'm curious how big these EWAHs will be in practice and
how useful an array of integers will be (especially as the
pretty format will be one integer per line).  Perhaps it
would be helpful to have an extended example in one of the
tests.

Would it be better to have the caller of ewah_each_bit()
build a hex or bit string in a strbuf and then write it
as a single string?

Jeff

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-24 18:00 ` [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Johannes Schindelin
  2019-06-24 18:39   ` Jeff Hostetler
@ 2019-06-25  9:05   ` Duy Nguyen
  2019-06-25  9:38     ` Thomas Gummerer
  2019-06-25 11:27     ` Johannes Schindelin
  1 sibling, 2 replies; 43+ messages in thread
From: Duy Nguyen @ 2019-06-25  9:05 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

On Tue, Jun 25, 2019 at 1:00 AM Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
> > - extension location is printed, in case you need to decode the
> >   extension by yourself (previously only the size is printed)
> > - all extensions are printed in the same order they appear in the file
> >   (previously eoie and ieot are printed first because that's how we
> >   parse)
> > - resolve undo extension is reorganized a bit to be easier to read
> > - tests added. Example json files are in t/t3011
>
> It might actually make sense to optionally disable showing extensions.
>
> You also forgot to mention that you explicitly disable handling
> `<pathspec>`, which I find a bit odd, personally, as that would probably
> come in real handy at times,

No. I mentioned the land of high level languages before. Filtering in
any Python, Ruby, Scheme, JavaScript, Java is a piece of cake and much
more flexible than pathspec. Even with shell scripts, jq could do a
much better job than pathspec. If you filter by pathspec, good luck
trying that on extensions.

It's the same reason why I will not provide a flexible way to disable
extensions. I'm not starting a JSON API for Git. I provide an index
file in JSON format. You do what you want with it. You have a format
easy enough to import to native data structures of your favorite
language.

> especially when we offer this as a better way
> for 3rd-party applications to interact with Git (which I think will be the
> use case for this feature that will be _far_ more common than using it for
> debugging).

We may have conflicting goals. For me, first priority is the debug
tool for Git developers. 3rd-party support is a stretch. I could move
all this back to test-tool, then you can provide a 3rd-party API if
you want. Or I'll withdraw this series and go back to my original
plan.
-- 
Duy

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
  2019-06-24 19:15   ` Jeff Hostetler
@ 2019-06-25  9:05   ` Thomas Gummerer
  2019-06-25  9:44   ` Johannes Schindelin
  2019-06-26 19:51   ` Junio C Hamano
  3 siblings, 0 replies; 43+ messages in thread
From: Thomas Gummerer @ 2019-06-25  9:05 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On 06/24, Nguyễn Thái Ngọc Duy wrote:
> So far we don't have a command to basically dump the index file out,
> with all its glory details. Checking some info, for example, stat
> time, usually involves either writing new code or firing up "xxd" and
> decoding values by yourself.
> 
> This --json is supposed to help that. It dumps the index in a human
> readable format but also easy to be processed with tools. And it will
> print almost enough info to reconstruct the index later.
> 
> In this patch we only dump the main part, not extensions. But at the
> end of the series, the entire index is dumped. The end result could be
> very verbose even on a small repository such as git.git.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Documentation/git-ls-files.txt    |  5 +++
>  builtin/ls-files.c                | 38 +++++++++++++---
>  cache.h                           |  2 +
>  json-writer.c                     | 22 ++++++++++
>  json-writer.h                     | 23 ++++++++++
>  read-cache.c                      | 72 ++++++++++++++++++++++++++++++-
>  t/t3011-ls-files-json.sh (new +x) | 44 +++++++++++++++++++
>  t/t3011/basic (new)               | 67 ++++++++++++++++++++++++++++
>  8 files changed, 265 insertions(+), 8 deletions(-)
>
> [...]
>
> diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
> new file mode 100755
> index 0000000000..97bcd814be
> --- /dev/null
> +++ b/t/t3011-ls-files-json.sh
> @@ -0,0 +1,44 @@
> +#!/bin/sh
> +
> +test_description='ls-files dumping json'
> +
> +. ./test-lib.sh
> +
> +strip_number() {
> +	for name; do
> +		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed
> +	done
> +}
> +
> +strip_string() {
> +	for name; do
> +		echo 's/\("'$name'":\) ".*"/\1 <string>/' >>filter.sed
> +	done
> +}
> +
> +compare_json() {
> +	git ls-files --debug-json >json &&
> +	sed -f filter.sed json >filtered &&
> +	test_cmp "$TEST_DIRECTORY"/t3011/"$1" filtered
> +}
> +
> +test_expect_success 'setup' '
> +	mkdir sub &&
> +	echo one >one &&
> +	git add one &&
> +	echo 2 >sub/two &&
> +	git add sub/two &&
> +
> +	echo intent-to-add >ita &&
> +	git add -N ita &&
> +
> +	strip_number ctime_sec ctime_nsec mtime_sec mtime_nsec &&
> +	strip_number device inode uid gid file_offset ext_size &&
> +	strip_string oid ident
> +'
> +
> +test_expect_success 'ls-files --json, main entries' '
> +	compare_json basic
> +'
> +
> +test_done
> diff --git a/t/t3011/basic b/t/t3011/basic
> new file mode 100644
> index 0000000000..9436445d90
> --- /dev/null
> +++ b/t/t3011/basic
> @@ -0,0 +1,67 @@
> +{
> +  "version": 3,

This will break the test suite when 'GIT_TEST_INDEX_VERSION' is set to
4 for example.  I think this applies to a few other tests in later
patches as well.

> +  "oid": <string>,
> +  "mtime_sec": <number>,
> +  "mtime_nsec": <number>,
> +  "entries": [
> +    {
> +      "id": 0,
> +      "name": "ita",
> +      "mode": "100644",
> +      "flags": 536887296,
> +      "extended_flags": true,
> +      "intent_to_add": true,
> +      "oid": <string>,
> +      "stat": {
> +        "ctime_sec": <number>,
> +        "ctime_nsec": <number>,
> +        "mtime_sec": <number>,
> +        "mtime_nsec": <number>,
> +        "device": <number>,
> +        "inode": <number>,
> +        "uid": <number>,
> +        "gid": <number>,
> +        "size": 0
> +      },
> +      "file_offset": <number>
> +    },
> +    {
> +      "id": 1,
> +      "name": "one",
> +      "mode": "100644",
> +      "flags": 0,
> +      "oid": <string>,
> +      "stat": {
> +        "ctime_sec": <number>,
> +        "ctime_nsec": <number>,
> +        "mtime_sec": <number>,
> +        "mtime_nsec": <number>,
> +        "device": <number>,
> +        "inode": <number>,
> +        "uid": <number>,
> +        "gid": <number>,
> +        "size": 4
> +      },
> +      "file_offset": <number>
> +    },
> +    {
> +      "id": 2,
> +      "name": "sub/two",
> +      "mode": "100644",
> +      "flags": 0,
> +      "oid": <string>,
> +      "stat": {
> +        "ctime_sec": <number>,
> +        "ctime_nsec": <number>,
> +        "mtime_sec": <number>,
> +        "mtime_nsec": <number>,
> +        "device": <number>,
> +        "inode": <number>,
> +        "uid": <number>,
> +        "gid": <number>,
> +        "size": 2
> +      },
> +      "file_offset": <number>
> +    }
> +  ]
> +}
> -- 
> 2.22.0.rc0.322.g2b0371e29a
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 19:15   ` Jeff Hostetler
  2019-06-24 20:04     ` Junio C Hamano
@ 2019-06-25  9:21     ` Johannes Schindelin
  2019-06-25  9:52     ` Duy Nguyen
  2 siblings, 0 replies; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-25  9:21 UTC (permalink / raw)
  To: Jeff Hostetler
  Cc: Nguyễn Thái Ngọc Duy, git, Junio C Hamano,
	Jeff King, Derrick Stolee, Ævar Arnfjörð Bjarmason

[-- Attachment #1: Type: text/plain, Size: 2492 bytes --]

Hi Jeff,

On Mon, 24 Jun 2019, Jeff Hostetler wrote:

> On 6/24/2019 9:02 AM, Nguyễn Thái Ngọc Duy wrote:
> [...]
> > +{
> > +	jw_object_inline_begin_object(jw, name);
> > +	jw_object_intmax(jw, "ctime_sec", sd->sd_ctime.sec);
> > +	jw_object_intmax(jw, "ctime_nsec", sd->sd_ctime.nsec);
> > +	jw_object_intmax(jw, "mtime_sec", sd->sd_mtime.sec);
> > +	jw_object_intmax(jw, "mtime_nsec", sd->sd_mtime.nsec);
>
> It'd be nice if we could also have a formatted date
> for the mtime and ctime in addition to the integer
> values.  (I'm not sure whether you'd always want them
> or make it a verbose option.)

It would be more consistent with JSON conventions to have only the
formatted date, preferably in ISO-8601 format [*1*], I would think.

And for debugging, it would also make more sense, in my opinion.

> [...]
> > diff --git a/read-cache.c b/read-cache.c
> > index 4dd22f4f6e..db5147d088 100644
> > --- a/read-cache.c
> > +++ b/read-cache.c
> > @@ -25,6 +25,7 @@
> >   #include "fsmonitor.h"
> >   #include "thread-utils.h"
> >   #include "progress.h"
> > +#include "json-writer.h"
> >
> >   /* Mask for the name length in ce_flags in the on-disk index */
> >
> > @@ -1952,6 +1953,49 @@ static void *load_index_extensions(void *_data)
> >   	return NULL;
> >   }
> >
> > +static void dump_cache_entry(struct index_state *istate,
> > +			     int index,
> > +			     unsigned long offset,
> > +			     const struct cache_entry *ce)
> > +{
> > +	struct json_writer *jw = istate->jw;
> > +
> > +	jw_array_inline_begin_object(jw);
> > +
> > +	/*
> > +	 * this is technically redundant, but it's for easier
> > +	 * navigation when there hundreds of entries
> > +	 */
> > +	jw_object_intmax(jw, "id", index);
> > +
> > +	jw_object_string(jw, "name", ce->name);
> > +
> > +	jw_object_filemode(jw, "mode", ce->ce_mode);
> > +
> > +	jw_object_intmax(jw, "flags", ce->ce_flags);
>
> It would be nice to have the flags as a hex-formatted string
> in addition to (or instead of) the decimal integer value.

Instead of, please, prefixed with "0x" to make it clear that this is hex.

And "mode" in octal, please, prefixed with "0", as that is the convention
to display file modes.

Thanks,
Dscho

Footnote *1*: It is too bad that JSON leaves the exact date format
unspecified, leading to a lot of inconsistency and putting the burden on
parsers:
https://stackoverflow.com/questions/10286204/the-right-json-date-format

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-25  9:05   ` Duy Nguyen
@ 2019-06-25  9:38     ` Thomas Gummerer
  2019-06-25 11:27     ` Johannes Schindelin
  1 sibling, 0 replies; 43+ messages in thread
From: Thomas Gummerer @ 2019-06-25  9:38 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Johannes Schindelin, Git Mailing List, Junio C Hamano, Jeff King,
	Derrick Stolee, Ævar Arnfjörð Bjarmason

On 06/25, Duy Nguyen wrote:
> On Tue, Jun 25, 2019 at 1:00 AM Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
> > especially when we offer this as a better way
> > for 3rd-party applications to interact with Git (which I think will be the
> > use case for this feature that will be _far_ more common than using it for
> > debugging).
> 
> We may have conflicting goals. For me, first priority is the debug
> tool for Git developers. 3rd-party support is a stretch. I could move
> all this back to test-tool, then you can provide a 3rd-party API if
> you want. Or I'll withdraw this series and go back to my original
> plan.

FWIW, I am very much in favor of this series, and would have found
something like this very useful many times in the past when I was
digging into the index code.  So I'd be more than happy to just have
this as debug tool, rather than as 3rd-party API.

I'd also be fine with this living in test-tool, as long as it's
somewhere in git.git where it's easily usable, I'd find it helpful.

Thanks for working on this!

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
  2019-06-24 19:15   ` Jeff Hostetler
  2019-06-25  9:05   ` Thomas Gummerer
@ 2019-06-25  9:44   ` Johannes Schindelin
  2019-06-25 11:31     ` Johannes Schindelin
  2019-06-25 22:28     ` Junio C Hamano
  2019-06-26 19:51   ` Junio C Hamano
  3 siblings, 2 replies; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-25  9:44 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

[-- Attachment #1: Type: text/plain, Size: 864 bytes --]

Hi Duy,

On Mon, 24 Jun 2019, Nguyễn Thái Ngọc Duy wrote:

> diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
> new file mode 100755
> index 0000000000..97bcd814be
> --- /dev/null
> +++ b/t/t3011-ls-files-json.sh
> @@ -0,0 +1,44 @@
> +#!/bin/sh
> +
> +test_description='ls-files dumping json'
> +
> +. ./test-lib.sh
> +
> +strip_number() {
> +	for name; do
> +		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed

This does not do what you think it does, in Ubuntu Xenial and on macOS:

https://dev.azure.com/gitgitgadget/git/_build/results?buildId=11408&view=ms.vss-test-web.build-test-results-tab&runId=27736&paneView=debug&resultId=105613

The `\1` is expanded to the ASCII character 001. Therefore your test cases
fail on almost all platforms.

Funnily enough, they pass on Windows...

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 19:15   ` Jeff Hostetler
  2019-06-24 20:04     ` Junio C Hamano
  2019-06-25  9:21     ` Johannes Schindelin
@ 2019-06-25  9:52     ` Duy Nguyen
  2019-06-25 15:37       ` Jeff Hostetler
  2 siblings, 1 reply; 43+ messages in thread
From: Duy Nguyen @ 2019-06-25  9:52 UTC (permalink / raw)
  To: Jeff Hostetler
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On Tue, Jun 25, 2019 at 2:15 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
> > @@ -202,6 +202,28 @@ void jw_object_null(struct json_writer *jw, const char *key)
> >       strbuf_addstr(&jw->json, "null");
> >   }
> >
> > +void jw_object_filemode(struct json_writer *jw, const char *key, mode_t mode)
> > +{
> > +     object_common(jw, key);
> > +     strbuf_addf(&jw->json, "\"%06o\"", mode);
> > +}
> > +
> > +void jw_object_stat_data(struct json_writer *jw, const char *name,
> > +                      const struct stat_data *sd)
>
> Should this be in json_writer.c or in read-cache.c ?
> Currently, json_writer.c is concerned with formatting
> JSON on basic/scalar types.  Do we want to start
> extending it to handle arbitrary structures?  Or would
> it be better for the code that defines/manipulates the
> structure to define a "stat_data_dump_json()" function.
>
> I'm torn on the jw_object_filemode() function, JSON format
> limits us to decimal integers and there are places where
> I'd like to have hex, or in this case octal values.
>
> I'm thinking it'd be better to have a helper function in
> read-cache.c that formats a local strbuf and calls
> js_object_string(&jw, key, buf);

I can move these back to read-cache.c. Though if we have a lot more jw
helpers like this (hard to tell at the moment) then perhaps we can
have json-writer-utils.c or something to group them together. That
keeps the "boring" code out of the main logic code in read-cache.c and
other call sites.

> > @@ -1952,6 +1953,49 @@ static void *load_index_extensions(void *_data)
> >       return NULL;
> >   }
> >
> > +static void dump_cache_entry(struct index_state *istate,
> > +                          int index,
> > +                          unsigned long offset,
> > +                          const struct cache_entry *ce)
> > +{
> > +     struct json_writer *jw = istate->jw;
> > +
> > +     jw_array_inline_begin_object(jw);
> > +
> > +     /*
> > +      * this is technically redundant, but it's for easier
> > +      * navigation when there hundreds of entries
> > +      */
> > +     jw_object_intmax(jw, "id", index);
> > +
> > +     jw_object_string(jw, "name", ce->name);
> > +
> > +     jw_object_filemode(jw, "mode", ce->ce_mode);
> > +
> > +     jw_object_intmax(jw, "flags", ce->ce_flags);
>
> It would be nice to have the flags as a hex-formatted string
> in addition to (or instead of) the decimal integer value.

I'm not against reformatting it in hex string, but is there a value in
it? ce_flags is expanded in the code below so that you don't have to
decode it yourself when you read each entry. The "flags" field here is
for further processing in tools. I'm trying to see if looking at hex
values helps, but I'm still not seeing it...

> > +     /*
> > +      * again redundant info, just so you don't have to decode
> > +      * flags values manually
> > +      */
> > +     if (ce->ce_flags & CE_EXTENDED)
> > +             jw_object_true(jw, "extended_flags");
> > +     if (ce->ce_flags & CE_VALID)
> > +             jw_object_true(jw, "assume_unchanged");
> > +     if (ce->ce_flags & CE_INTENT_TO_ADD)
> > +             jw_object_true(jw, "intent_to_add");
> > +     if (ce->ce_flags & CE_SKIP_WORKTREE)
> > +             jw_object_true(jw, "skip_worktree");
> > +     if (ce_stage(ce))
> > +             jw_object_intmax(jw, "stage", ce_stage(ce));
-- 
Duy

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-24 20:06   ` Jeff Hostetler
@ 2019-06-25 10:29     ` Duy Nguyen
  2019-06-25 12:40       ` Derrick Stolee
  0 siblings, 1 reply; 43+ messages in thread
From: Duy Nguyen @ 2019-06-25 10:29 UTC (permalink / raw)
  To: Jeff Hostetler
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On Tue, Jun 25, 2019 at 3:06 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
> I'm curious how big these EWAHs will be in practice and
> how useful an array of integers will be (especially as the
> pretty format will be one integer per line).  Perhaps it
> would be helpful to have an extended example in one of the
> tests.

It's one integer per updated entry. So if you have a giant index and
updated every single one of them, the EWAH bitmap contains that many
integers.

If it was easy to just merge these bitmaps back to the entry (e.g. in
this example, add "replaced": true to entry zero) I would have done
it. But we dump as we stream and it's already too late to do it.

> Would it be better to have the caller of ewah_each_bit()
> build a hex or bit string in a strbuf and then write it
> as a single string?

I don't think the current EWAH representation is easy to read in the
first place. You'll probably have to run through some script to update
the main entries part and will have a much better view, but that's
pretty quick. If it's for scripts, then it's probably best to keep as
an array of integers, not a string. Less post processing.

Another reason for not merging to one string (might not be a very good
argument though) is to help diff between two indexes.
One-number-per-line works well with "git diff --no-index" while one
long string is a bit harder. I did this kind of comparison when I made
changes in read-cache.c and wanted to check if the new index file is
completely broken, or just slightly broken.
-- 
Duy

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-25  9:05   ` Duy Nguyen
  2019-06-25  9:38     ` Thomas Gummerer
@ 2019-06-25 11:27     ` Johannes Schindelin
  2019-06-25 12:06       ` Duy Nguyen
  1 sibling, 1 reply; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-25 11:27 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

Hi Duy,

On Tue, 25 Jun 2019, Duy Nguyen wrote:

> On Tue, Jun 25, 2019 at 1:00 AM Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
> > > - extension location is printed, in case you need to decode the
> > >   extension by yourself (previously only the size is printed)
> > > - all extensions are printed in the same order they appear in the file
> > >   (previously eoie and ieot are printed first because that's how we
> > >   parse)
> > > - resolve undo extension is reorganized a bit to be easier to read
> > > - tests added. Example json files are in t/t3011
> >
> > It might actually make sense to optionally disable showing extensions.
> >
> > You also forgot to mention that you explicitly disable handling
> > `<pathspec>`, which I find a bit odd, personally, as that would probably
> > come in real handy at times,
>
> No. I mentioned the land of high level languages before. Filtering in
> any Python, Ruby, Scheme, JavaScript, Java is a piece of cake and much
> more flexible than pathspec.

I heard that type of argument before. I was working on the initial Windows
port of Git, uh, of course I was working on a big refactoring of a big C++
application backed by a database. A colleague suggested that filtering
could be done much better in C++, on the desktop, than in SQL. And so they
changed the paradigm to "simplify" the SQL query, and instead dropped the
unwanted data after it had hit the RAM of the client machine.

Turns out it was a bad idea. A _really_ bad idea. Because it required
downloading 30MB of data to several dozen computers in parallel,
at the start of every shift.

This change was reverted in one big hurry, and the colleague was tasked to
learn them some SQL.

Why am I telling you this story? Because you fall into the exact same trap
as my colleague.

In this instance, it may not be so much network bandwidth, but it is still
quite a suboptimal idea to render JSON for possibly tens of thousands of
files, then parse the same JSON on the receiving side, then spend even more
time to drop all but a dozen files.

And this is _even more_ relevant when you want to debug things.

In short: I am quite puzzled why this is even debated here. There is a
reason, a good reason, why `git ls-files` accepts pathspecs. I would not
want to ignore the lessons of history as willfully here.

> Even with shell scripts, jq could do a much better job than pathspec. If
> you filter by pathspec, good luck trying that on extensions.

You keep harping on extensions, but the reality of the matter is that they
are rarely interesting. I would even wager a bet that we will end up
excluding them from the JSON output by default.

Most of the times when I had to decode the index file manually in the
past, it was about the regular file entries.

There was *one* week in which I had to decode the untracked cache a bit,
to the point where I patched the test helper locally to help me with that.

If my experience in debugging these things is any indicator, extensions
matter not even 10% as much as the non-extension data.

And that's not even taking into account the third-party software that
could definitely benefit from having this JSON format as query result.

In my work as Git for Windows maintainer, I do hear about the needs of
third-party software developers quite a bit, so I would claim that I know
a bit about what they need, why the NUL-terminated format is not a good
match, and how much a JSON-based API would help.

So while that is not your pet, it will be the most useful part of the
outcome of your work.

> It's the same reason why I will not provide a flexible way to disable
> extensions. I'm not starting a JSON API for Git. I provide an index
> file in JSON format. You do what you want with it. You have a format
> easy enough to import to native data structures of your favorite
> language.

I understand that you don't care.

Your patch series is just too good a start on something truly useful to
pass up on the opportunity.

> > especially when we offer this as a better way for 3rd-party
> > applications to interact with Git (which I think will be the use case
> > for this feature that will be _far_ more common than using it for
> > debugging).
>
> We may have conflicting goals. For me, first priority is the debug
> tool for Git developers. 3rd-party support is a stretch. I could move
> all this back to test-tool, then you can provide a 3rd-party API if
> you want. Or I'll withdraw this series and go back to my original
> plan.

You don't need JSON if you want to debug things. That would be a lot of
love lost, if debugging was your goal.

I guess I'll wait until your patch series hits `next`, and then try to
find some time to work on that feature.

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-25  9:44   ` Johannes Schindelin
@ 2019-06-25 11:31     ` Johannes Schindelin
  2019-06-25 13:57       ` Johannes Schindelin
  2019-06-25 22:28     ` Junio C Hamano
  1 sibling, 1 reply; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-25 11:31 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

[-- Attachment #1: Type: text/plain, Size: 3096 bytes --]

Hi Duy,

On Tue, 25 Jun 2019, Johannes Schindelin wrote:

> On Mon, 24 Jun 2019, Nguyễn Thái Ngọc Duy wrote:
>
> > diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
> > new file mode 100755
> > index 0000000000..97bcd814be
> > --- /dev/null
> > +++ b/t/t3011-ls-files-json.sh
> > @@ -0,0 +1,44 @@
> > +#!/bin/sh
> > +
> > +test_description='ls-files dumping json'
> > +
> > +. ./test-lib.sh
> > +
> > +strip_number() {
> > +	for name; do
> > +		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed
>
> This does not do what you think it does, in Ubuntu Xenial and on macOS:
>
> https://dev.azure.com/gitgitgadget/git/_build/results?buildId=11408&view=ms.vss-test-web.build-test-results-tab&runId=27736&paneView=debug&resultId=105613
>
> The `\1` is expanded to the ASCII character 001. Therefore your test cases
> fail on almost all platforms.

The `strip_number()`/`strip_string()` approach might look elegant from a
design perspective, but from a readability perspective (and obviously,
when one wants to make those tests more robust and cross-platform), it
would be a lot better to do it more explicitly.

This patch on top of your patch series makes the test run correctly in my
Linux and Windows setup, and much easier to understand:

-- snipsnap --
diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
index 9f4ad4c9cf..8b782c48e0 100755
--- a/t/t3011-ls-files-json.sh
+++ b/t/t3011-ls-files-json.sh
@@ -4,18 +4,6 @@ test_description='ls-files dumping json'

 . ./test-lib.sh

-strip_number() {
-	for name; do
-		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed
-	done
-}
-
-strip_string() {
-	for name; do
-		echo 's/\("'$name'":\) ".*"/\1 <string>/' >>filter.sed
-	done
-}
-
 compare_json() {
 	git ls-files --debug-json >json &&
 	sed -f filter.sed json >filtered &&
@@ -35,9 +23,21 @@ test_expect_success 'setup' '
 	echo intent-to-add >ita &&
 	git add -N ita &&

-	strip_number ctime_sec ctime_nsec mtime_sec mtime_nsec &&
-	strip_number device inode uid gid file_offset ext_size last_update &&
-	strip_string oid ident
+	cat >filter.sed <<-\EOF
+	s/\("ctime_sec":\) [0-9]\+/\1 <number>/
+	s/\("ctime_nsec":\) [0-9]\+/\1 <number>/
+	s/\("mtime_sec":\) [0-9]\+/\1 <number>/
+	s/\("mtime_nsec":\) [0-9]\+/\1 <number>/
+	s/\("device":\) [0-9]\+/\1 <number>/
+	s/\("inode":\) [0-9]\+/\1 <number>/
+	s/\("uid":\) [0-9]\+/\1 <number>/
+	s/\("gid":\) [0-9]\+/\1 <number>/
+	s/\("file_offset":\) [0-9]\+/\1 <number>/
+	s/\("ext_size":\) [0-9]\+/\1 <number>/
+	s/\("last_update":\) [0-9]\+/\1 <number>/
+	s/\("oid":\) ".*"/\1 <string>/
+	s/\("ident":\) ".*"/\1 <string>/
+	EOF
 '

 test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
@@ -98,7 +98,9 @@ test_expect_success !SINGLE_CPU 'ls-files --json and multicore extensions' '
 		touch one two three four &&
 		git add . &&
 		cp ../filter.sed . &&
-		strip_number offset &&
+		cat >>filter.sed <<-\EOF &&
+		s/\("offset":\) [0-9]\+/\1 <number>/
+		EOF
 		compare_json eoie
 	)
 '

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-25 11:27     ` Johannes Schindelin
@ 2019-06-25 12:06       ` Duy Nguyen
  2019-06-25 14:10         ` Johannes Schindelin
  0 siblings, 1 reply; 43+ messages in thread
From: Duy Nguyen @ 2019-06-25 12:06 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

On Tue, Jun 25, 2019 at 6:27 PM Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
>
> Hi Duy,
>
> On Tue, 25 Jun 2019, Duy Nguyen wrote:
>
> > On Tue, Jun 25, 2019 at 1:00 AM Johannes Schindelin
> > <Johannes.Schindelin@gmx.de> wrote:
> > > > - extension location is printed, in case you need to decode the
> > > >   extension by yourself (previously only the size is printed)
> > > > - all extensions are printed in the same order they appear in the file
> > > >   (previously eoie and ieot are printed first because that's how we
> > > >   parse)
> > > > - resolve undo extension is reorganized a bit to be easier to read
> > > > - tests added. Example json files are in t/t3011
> > >
> > > It might actually make sense to optionally disable showing extensions.
> > >
> > > You also forgot to mention that you explicitly disable handling
> > > `<pathspec>`, which I find a bit odd, personally, as that would probably
> > > come in real handy at times,
> >
> > No. I mentioned the land of high level languages before. Filtering in
> > any Python, Ruby, Scheme, JavaScript, Java is a piece of cake and much
> > more flexible than pathspec.
>
> I heard that type of argument before. I was working on the initial Windows
> port of Git, uh, of course I was working on a big refactoring of a big C++
> application backed by a database. A colleague suggested that filtering
> could be done much better in C++, on the desktop, than in SQL. And so they
> changed the paradigm to "simplify" the SQL query, and instead dropped the
> unwanted data after it had hit the RAM of the client machine.
>
> Turns out it was a bad idea. A _really_ bad idea. Because it required
> downloading 30MB of data for about several dozens computers in parallel,
> at the start of every shift.
>
> This change was reverted in one big hurry, and the colleague was tasked to
> learn them some SQL.
>
> Why am I telling you this story? Because you fall into the exact same trap
> as my colleague.
>
> In this instance, it may not be so much network bandwidth, but it is still
> quite a suboptimal idea to render JSON for possibly tens of thousands of
> files, then parse the same JSON on the receiving side, then spend even more
> time to drop all but a dozen files.

This was mentioned before [1]. Of course I don't work on giant index
files, but I would assume the cost of parsing JSON (at least with a
stream-based one, not loading the whole thing in core) is still
cheaper. And you could still do it in iteration, saving every step
until you have the reasonable small dataset to work on. The other side
of the story is, are we sure parsing and executing pathspec is cheap?
I'm not so sure, especially when pathspec code is not exactly
optimized.

Consider the amount of code to support something like that. I'd rather
wait until a real example comes up and no good solution is found without
modifying git.git, before actually supporting it.

[1] https://public-inbox.org/git/45e49624-be8e-deff-bf9d-aee052991189@gmail.com/

> And this is _even more_ relevant when you want to debug things.
>
> In short: I am quite puzzled why this is even debated here. There is a
> reason, a good reason, why `git ls-files` accepts pathspecs. I would not
> want to ignore the lessons of history as willfully here.

I guess you and I have different ways of debugging things.

> > Even with shell scripts, jq could do a much better job than pathspec. If
> > you filter by pathspec, good luck trying that on extensions.
>
> You keep harping on extensions, but the reality of the matter is that they
> are rarely interesting. I would even wager a bet that we will end up
> excluding them from the JSON output by default.
>
> Most of the times when I had to decode the index file manually in the
> past, it was about the regular file entries.
>
> There was *one* week in which I had to decode the untracked cache a bit,
> to the point where I patched the test helper locally to help me with that.
>
> If my experience in debugging these things is any indicator, extensions do
> not matter even 10% of the non-extension data.

Again our experiences differ. Mine is mostly about extensions,
probably because I had to work on them more often. For normal entries
"ls-files --debug" gives you 99% what's in the index file already.

> > > especially when we offer this as a better way for 3rd-party
> > > applications to interact with Git (which I think will be the use case
> > > for this feature that will be _far_ more common than using it for
> > > debugging).
> >
> > We may have conflicting goals. For me, first priority is the debug
> > tool for Git developers. 3rd-party support is a stretch. I could move
> > all this back to test-tool, then you can provide a 3rd-party API if
> > you want. Or I'll withdraw this series and go back to my original
> > plan.
>
> You don't need JSON if you want to debug things. That would be a lot of
> love lost, if debugging was your goal.

No, I did think of some other line-based format before I ended up with
JSON. I did not want to use it in the beginning.

The thing is, a giant table to cover all fields and entries in the
main index is not as easy to navigate, or search even in 'less'. And
the hierarchical structure of some extensions is hard to represent in
a good way (at least without writing lots of code). On top of that I
still need some easy way to parse and post-process, like how much
saving I could gain if I compressed stat data. And the final nail is
json-writer.c is already there, much less work.

So JSON was the best option I found to meet all those points.
-- 
Duy

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-25 10:29     ` Duy Nguyen
@ 2019-06-25 12:40       ` Derrick Stolee
  2019-06-27 10:48         ` Duy Nguyen
  0 siblings, 1 reply; 43+ messages in thread
From: Derrick Stolee @ 2019-06-25 12:40 UTC (permalink / raw)
  To: Duy Nguyen, Jeff Hostetler
  Cc: Git Mailing List, Junio C Hamano, Jeff King,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On 6/25/2019 6:29 AM, Duy Nguyen wrote:
> On Tue, Jun 25, 2019 at 3:06 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
>> I'm curious how big these EWAHs will be in practice and
>> how useful an array of integers will be (especially as the
>> pretty format will be one integer per line).  Perhaps it
>> would helpful to have an extended example in one of the
>> tests.
> 
> It's one integer per updated entry. So if you have a giant index and
> updated every single one of them, the EWAH bitmap contains that many
> integers.
> 
> If it was easy to just merge these bitmaps back to the entry (e.g. in
> this example, add "replaced": true to entry zero) I would have done
> it. But we dump as we stream and it's already too late to do it.
> 
>> Would it be better to have the caller of ewah_each_bit()
>> build a hex or bit string in a strbuf and then write it
>> as a single string?
> 
> I don't think the current EWAH representation is easy to read in the
> first place. You'll probably have to run through some script to update
> the main entries part and will have a much better view, but that's
> pretty quick. If it's for scripts, then it's probably best to keep as
> an array of integers, not a string. Less post processing.

I don't think the intent is to dump the EWAH directly, but instead to
dump a string of the uncompressed bitmap. Something like:

	"delete_bitmap" : "01101101101"

instead of

	"delete_bitmap" : [ 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 ]

> Another reason for not merging to one string (might not be a very good
> argument though) is to help diff between two indexes.
> One-number-per-line works well with "git diff --no-index" while one
> long string is a bit harder. I did this kind of comparison when I made
> changes in read-cache.c and wanted to check if the new index file is
> completely broken, or just slighly broken.

You're right that the diff of the json output is an interesting
use, and the "single string" output is not helpful. What about
batches of 64-bit strings? For example:

	"delete_bitmap" : [
		"0101010101010101010101010101010101010101010101010101010101010101",
		"0101010101010101010101010101010101010101010101010101010101010101",
		"0101010101010101010101010101010101010101010101010101010101010101",
		"01010101010101"
	]

This could be a happy medium between the two options, but does require
some extra work in the formatter.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-25 11:31     ` Johannes Schindelin
@ 2019-06-25 13:57       ` Johannes Schindelin
  0 siblings, 0 replies; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-25 13:57 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

Hi Duy,

On Tue, 25 Jun 2019, Johannes Schindelin wrote:

> diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
> index 9f4ad4c9cf..8b782c48e0 100755
> --- a/t/t3011-ls-files-json.sh
> +++ b/t/t3011-ls-files-json.sh
> @@ -4,18 +4,6 @@ test_description='ls-files dumping json'
>
>  . ./test-lib.sh
>
> -strip_number() {
> -	for name; do
> -		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed
> -	done
> -}
> -
> -strip_string() {
> -	for name; do
> -		echo 's/\("'$name'":\) ".*"/\1 <string>/' >>filter.sed
> -	done
> -}
> -
>  compare_json() {
>  	git ls-files --debug-json >json &&
>  	sed -f filter.sed json >filtered &&
> @@ -35,9 +23,21 @@ test_expect_success 'setup' '
>  	echo intent-to-add >ita &&
>  	git add -N ita &&
>
> -	strip_number ctime_sec ctime_nsec mtime_sec mtime_nsec &&
> -	strip_number device inode uid gid file_offset ext_size last_update &&
> -	strip_string oid ident
> +	cat >filter.sed <<-\EOF
> +	s/\("ctime_sec":\) [0-9]\+/\1 <number>/

And of course, \+ still isn't POSIX, so you have to write [0-9][0-9]*
instead.

Ciao,
Johannes

> +	s/\("ctime_nsec":\) [0-9]\+/\1 <number>/
> +	s/\("mtime_sec":\) [0-9]\+/\1 <number>/
> +	s/\("mtime_nsec":\) [0-9]\+/\1 <number>/
> +	s/\("device":\) [0-9]\+/\1 <number>/
> +	s/\("inode":\) [0-9]\+/\1 <number>/
> +	s/\("uid":\) [0-9]\+/\1 <number>/
> +	s/\("gid":\) [0-9]\+/\1 <number>/
> +	s/\("file_offset":\) [0-9]\+/\1 <number>/
> +	s/\("ext_size":\) [0-9]\+/\1 <number>/
> +	s/\("last_update":\) [0-9]\+/\1 <number>/
> +	s/\("oid":\) ".*"/\1 <string>/
> +	s/\("ident":\) ".*"/\1 <string>/
> +	EOF
>  '
>
>  test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
> @@ -98,7 +98,9 @@ test_expect_success !SINGLE_CPU 'ls-files --json and multicore extensions' '
>  		touch one two three four &&
>  		git add . &&
>  		cp ../filter.sed . &&
> -		strip_number offset &&
> +		cat >>filter.sed <<-\EOF &&
> +		s/\("offset":\) [0-9]\+/\1 <number>/
> +		EOF
>  		compare_json eoie
>  	)
>  '

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-25 12:06       ` Duy Nguyen
@ 2019-06-25 14:10         ` Johannes Schindelin
  2019-06-25 17:08           ` Ramsay Jones
  0 siblings, 1 reply; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-25 14:10 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

Hi Duy,

On Tue, 25 Jun 2019, Duy Nguyen wrote:

> On Tue, Jun 25, 2019 at 6:27 PM Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
> >
> > On Tue, 25 Jun 2019, Duy Nguyen wrote:
> >
> > > On Tue, Jun 25, 2019 at 1:00 AM Johannes Schindelin
> > > <Johannes.Schindelin@gmx.de> wrote:
> > > > > - extension location is printed, in case you need to decode the
> > > > >   extension by yourself (previously only the size is printed)
> > > > > - all extensions are printed in the same order they appear in the file
> > > > >   (previously eoie and ieot are printed first because that's how we
> > > > >   parse)
> > > > > - resolve undo extension is reorganized a bit to be easier to read
> > > > > - tests added. Example json files are in t/t3011
> > > >
> > > > It might actually make sense to optionally disable showing extensions.
> > > >
> > > > You also forgot to mention that you explicitly disable handling
> > > > `<pathspec>`, which I find a bit odd, personally, as that would probably
> > > > come in real handy at times,
> > >
> > > No. I mentioned the land of high level languages before. Filtering in
> > > any Python, Ruby, Scheme, JavaScript, Java is a piece of cake and much
> > > more flexible than pathspec.
> >
> > I heard that type of argument before. I was working on the initial Windows
> > port of Git, uh, of course I was working on a big refactoring of a big C++
> > application backed by a database. A colleague suggested that filtering
> > could be done much better in C++, on the desktop, than in SQL. And so they
> > changed the paradigm to "simplify" the SQL query, and instead dropped the
> > unwanted data after it had hit the RAM of the client machine.
> >
> > Turns out it was a bad idea. A _really_ bad idea. Because it required
> > downloading 30MB of data for about several dozens computers in parallel,
> > at the start of every shift.
> >
> > This change was reverted in one big hurry, and the colleague was tasked to
> > learn them some SQL.
> >
> > Why am I telling you this story? Because you fall into the exact same trap
> > as my colleague.
> >
> > In this instance, it may not be so much network bandwidth, but it is still
> > quite a suboptimal idea to render JSON for possibly tens of thousands of
> > files, then parse the same JSON on the receiving side, the spend even more
> > time to drop all but a dozen files.
>
> This was mentioned before [1]. Of course I don't work on giant index
> files, but I would assume the cost of parsing JSON (at least with a
> stream-based one, not loading the whole thing in core) is still
> cheaper.

You may have heard that a few thousand of my colleagues are working on
what they call the largest repository on this planet.

No, the cost of parsing JSON only to throw away the majority of the parsed
information is not cheap. It is a clear sign of a design in want of being
improved.

> And you could still do it in iteration, saving every step until you have
> the reasonable small dataset to work on. The other side of the story is,
> are we sure parsing and executing pathspec is cheap? I'm not so sure,
> especially when pathspec code is not exactly optimized.

Let's not try to slap on workaround over workaround. Let's fix the root
cause. (Being: don't filter at the wrong end.)

> Consider the amount of code to support something like that.

Given that I am pretty familiar with the pathspec machinery due to working
with it in the `git stash` and `git add -p` built-ins, I have a very easy
time considering the amount of code. It makes me smile how little code
will be needed.

> I'd rather wait until a real example come up and no good solution found
> without modify git.git, before actually supporting it.

Oh hey, there you go: Team Explorer. Visual Studio Code. Literally every
single 3rd-party application that needs to deal with real-world loads.
Every single one.

> > And this is _even more_ relevant when you want to debug things.
> >
> > In short: I am quite puzzled why this is even debated here. There is a
> > reason, a good reason, why `git ls-files` accepts pathspecs. I would not
> > want to ignore the lessons of history as willfully here.
>
> I guess you and I have different ways of debugging things.

Yep, I'm with Lincoln here: Give me six hours to debug a problem and I
will spend the first four optimizing the edit-build-test cycle.

> > > Even with shell scripts, jq could do a much better job than pathspec. If
> > > you filter by pathspec, good luck trying that on extensions.
> >
> > You keep harping on extensions, but the reality of the matter is that they
> > are rarely interesting. I would even wager a bet that we will end up
> > excluding them from the JSON output by default.
> >
> > Most of the times when I had to decode the index file manually in the
> > past, it was about the regular file entries.
> >
> > There was *one* week in which I had to decode the untracked cache a bit,
> > to the point where I patched the test helper locally to help me with that.
> >
> > If my experience in debugging these things is any indicator, extensions do
> > not matter even 10% of the non-extension data.
>
> Again our experiences differ. Mine is mostly about extensions,
> probably because I had to work on them more often. For normal entries
> "ls-files --debug" gives you 99% what's in the index file already.

Like the device. And the ctime. And the file size. And the uid/gid. Is
that what you mean?

I don't know whether I missed a joke or not.

> > You don't need JSON if you want to debug things. That would be a lot of
> > love lost, if debugging was your goal.
>
> No, I did think of some other line-based format before I ended up with
> JSON. I did not want to use it in the beginning.

Then why bother.

> The thing is, a giant table to cover all fields and entries in the
> main index is not as easy to navigate, or search even in 'less'. And
> the hierarchical structure of some extensions is hard to represent in
> good way (at least without writing lots of code). On top of that I
> still need some easy way to parse and post-process, like how much
> saving I could gain if I compressed stat data. And the final nail is
> json-writer.c is already there, much less work.
>
> So JSON was the best option I found to meet all those points.

Well, as I said: you're obviously dead-set to optimize this for debugging
your own problems. The beauty of open source is that it can be turned into
something of wider use.

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-25  9:52     ` Duy Nguyen
@ 2019-06-25 15:37       ` Jeff Hostetler
  0 siblings, 0 replies; 43+ messages in thread
From: Jeff Hostetler @ 2019-06-25 15:37 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin



On 6/25/2019 5:52 AM, Duy Nguyen wrote:
> On Tue, Jun 25, 2019 at 2:15 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
>>> @@ -202,6 +202,28 @@ void jw_object_null(struct json_writer *jw, const char *key)
>>>        strbuf_addstr(&jw->json, "null");
>>>    }
>>>
>>> +void jw_object_filemode(struct json_writer *jw, const char *key, mode_t mode)
>>> +{
>>> +     object_common(jw, key);
>>> +     strbuf_addf(&jw->json, "\"%06o\"", mode);
>>> +}
>>> +
>>> +void jw_object_stat_data(struct json_writer *jw, const char *name,
>>> +                      const struct stat_data *sd)
>>
>> Should this be in json_writer.c or in read-cache.c ?
>> Currently, json_writer.c is concerned with formatting
>> JSON on basic/scalar types.  Do we want to start
>> extending it to handle arbitrary structures?  Or would
>> it be better for the code that defines/manipulates the
>> structure to define a "stat_data_dump_json()" function.
>>
>> I'm torn on the jw_object_filemode() function, JSON format
>> limits us to decimal integers and there are places where
>> I'd like to have hex, or in this case octal values.
>>
>> I'm thinking it'd be better to have a helper function in
>> read-cache.c that formats a local strbuf and calls
>> js_object_string(&jw, key, buf);
> 
> I can move these back to read-cache.c. Though if we have a lot more jw
> helpers like this (hard to tell at the moment) then perhaps we can
> have json-writer-utils.c or something to group them together. That
> keep the "boring" code out of main logic code in read-cache.c and
> other call sites.

yeah, in a utils file or close to the "constructors" of the
structure types.  either one works.

> 
>>> @@ -1952,6 +1953,49 @@ static void *load_index_extensions(void *_data)
>>>        return NULL;
>>>    }
>>>
>>> +static void dump_cache_entry(struct index_state *istate,
>>> +                          int index,
>>> +                          unsigned long offset,
>>> +                          const struct cache_entry *ce)
>>> +{
>>> +     struct json_writer *jw = istate->jw;
>>> +
>>> +     jw_array_inline_begin_object(jw);
>>> +
>>> +     /*
>>> +      * this is technically redundant, but it's for easier
>>> +      * navigation when there hundreds of entries
>>> +      */
>>> +     jw_object_intmax(jw, "id", index);
>>> +
>>> +     jw_object_string(jw, "name", ce->name);
>>> +
>>> +     jw_object_filemode(jw, "mode", ce->ce_mode);
>>> +
>>> +     jw_object_intmax(jw, "flags", ce->ce_flags);
>>
>> It would be nice to have the flags as a hex-formatted string
>> in addition to (or instead of) the decimal integer value.
> 
> I'm not against reformatting it in hex string, but is there a value in
> it? ce_flags is expanded in the code below so that you don't have to
> decode it yourself when you read each entry. The "flags" field here is
> for further processing in tools. I'm trying to see if looking at hex
> values helps, but I'm still not seeing it...
> 

I guess I was thinking of the in-memory bits and thinking
it'd be useful to be able to dump the index immediately
after reading it and then later after some operation or
traversal and see the intermediate states.  But I realize
now that that's not what you're building.  This is a dump
it while you're reading it feature (and that's fine).

So, as long as you have all of the on-disk bits, we should
be fine as you suggest.

Jeff


>>> +     /*
>>> +      * again redundant info, just so you don't have to decode
>>> +      * flags values manually
>>> +      */
>>> +     if (ce->ce_flags & CE_EXTENDED)
>>> +             jw_object_true(jw, "extended_flags");
>>> +     if (ce->ce_flags & CE_VALID)
>>> +             jw_object_true(jw, "assume_unchanged");
>>> +     if (ce->ce_flags & CE_INTENT_TO_ADD)
>>> +             jw_object_true(jw, "intent_to_add");
>>> +     if (ce->ce_flags & CE_SKIP_WORKTREE)
>>> +             jw_object_true(jw, "skip_worktree");
>>> +     if (ce_stage(ce))
>>> +             jw_object_intmax(jw, "stage", ce_stage(ce));

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-25 14:10         ` Johannes Schindelin
@ 2019-06-25 17:08           ` Ramsay Jones
  2019-06-26 15:05             ` Johannes Schindelin
  0 siblings, 1 reply; 43+ messages in thread
From: Ramsay Jones @ 2019-06-25 17:08 UTC (permalink / raw)
  To: Johannes Schindelin, Duy Nguyen
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason



On 25/06/2019 15:10, Johannes Schindelin wrote:
> Hi Duy,
[snip]

>> Again our experiences differ. Mine is mostly about extensions,
>> probably because I had to work on them more often. For normal entries
>> "ls-files --debug" gives you 99% what's in the index file already.
> 
> Like the device. And the ctime. And the file size. And the uid/gid. Is
> that what you mean?

Hmm, well I think so:

  $ git ls-files --debug git.c git-compat-util.h 
  git-compat-util.h
    ctime: 1561457278:502638001
    mtime: 1561457278:502638001
    dev: 2049	ino: 262663
    uid: 1000	gid: 1000
    size: 35440	flags: 0
  git.c
    ctime: 1561457278:518646000
    mtime: 1561457278:518646000
    dev: 2049	ino: 263083
    uid: 1000	gid: 1000
    size: 26837	flags: 0
  $ 

I have occasionally added stuff to the '--debug' output
while debugging something, but the above is usually
sufficient for my uses. (Having said that, I have not
had the need to debug extensions [yet!]).

ATB,
Ramsay Jones

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-25  9:44   ` Johannes Schindelin
  2019-06-25 11:31     ` Johannes Schindelin
@ 2019-06-25 22:28     ` Junio C Hamano
  1 sibling, 0 replies; 43+ messages in thread
From: Junio C Hamano @ 2019-06-25 22:28 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: Nguyễn Thái Ngọc Duy, git, Jeff King,
	Derrick Stolee, Ævar Arnfjörð Bjarmason

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

>> +		echo 's/\("'$name'":\) [0-9]\+/\1 <number>/' >>filter.sed
>
> This does not do what you think it does, in Ubuntu Xenial and on macOS:
>
> https://dev.azure.com/gitgitgadget/git/_build/results?buildId=11408&view=ms.vss-test-web.build-test-results-tab&runId=27736&paneView=debug&resultId=105613
>
> The `\1` is expanded to the ASCII character 001. Therefore your test cases
> fail on almost all platforms.
>
> Funnily enough, they pass on Windows...

bash, dash and /bin/echo behave differently given 

    $ echo 'foo \1 bar'

some 'echo' suffer from the "\<n>" interpolation.  Some don't.

I think your spelled-out version downthread (except for stepping out
of BRE which would break your sed script, as you realized) would be
a much more readable alternative.

Thanks.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json
  2019-06-25 17:08           ` Ramsay Jones
@ 2019-06-26 15:05             ` Johannes Schindelin
  0 siblings, 0 replies; 43+ messages in thread
From: Johannes Schindelin @ 2019-06-26 15:05 UTC (permalink / raw)
  To: Ramsay Jones
  Cc: Duy Nguyen, Git Mailing List, Junio C Hamano, Jeff King,
	Derrick Stolee, Ævar Arnfjörð Bjarmason

Hi Ramsay,

On Tue, 25 Jun 2019, Ramsay Jones wrote:

> On 25/06/2019 15:10, Johannes Schindelin wrote:
> > Hi Duy,
> [snip]
>
> >> Again our experiences differ. Mine is mostly about extensions,
> >> probably because I had to work on them more often. For normal entries
> >> "ls-files --debug" gives you 99% what's in the index file already.
> >
> > Like the device. And the ctime. And the file size. And the uid/gid. Is
> > that what you mean?
>
> Hmm, well I think so:
>
>   $ git ls-files --debug git.c git-compat-util.h
>   git-compat-util.h
>     ctime: 1561457278:502638001
>     mtime: 1561457278:502638001
>     dev: 2049	ino: 262663
>     uid: 1000	gid: 1000
>     size: 35440	flags: 0
>   git.c
>     ctime: 1561457278:518646000
>     mtime: 1561457278:518646000
>     dev: 2049	ino: 263083
>     uid: 1000	gid: 1000
>     size: 26837	flags: 0
>   $
>
> I have occasionally added stuff to the '--debug' output
> while debugging something, but the above is usually
> sufficient for my uses. (Having said that, I have not
> had the need to debug extensions [yet!]).

Well, live and learn. So for debugging purposes, we already have a good
facility and would not need JSON at all.

Good to know,
Dscho

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 01/10] ls-files: add --json to dump the index
  2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
                     ` (2 preceding siblings ...)
  2019-06-25  9:44   ` Johannes Schindelin
@ 2019-06-26 19:51   ` Junio C Hamano
  3 siblings, 0 replies; 43+ messages in thread
From: Junio C Hamano @ 2019-06-26 19:51 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> This --json is supposed to help that. It dumps the index in a human

This and another one on the title need to become "--debug-json".

Are we expecting a reroll to introduce another layer above the most
primitive json writer that knows the schema used to represent both
system standard and our application-specific structures, or is the
current arrangement to have them in json-writer.c until there are
enough of them to warrant such a split good enough?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-25 12:40       ` Derrick Stolee
@ 2019-06-27 10:48         ` Duy Nguyen
  2019-06-27 13:24           ` Jeff Hostetler
  0 siblings, 1 reply; 43+ messages in thread
From: Duy Nguyen @ 2019-06-27 10:48 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Jeff Hostetler, Git Mailing List, Junio C Hamano, Jeff King,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On Tue, Jun 25, 2019 at 7:40 PM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 6/25/2019 6:29 AM, Duy Nguyen wrote:
> > On Tue, Jun 25, 2019 at 3:06 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
> >> I'm curious how big these EWAHs will be in practice and
> >> how useful an array of integers will be (especially as the
> >> pretty format will be one integer per line).  Perhaps it
> >> would helpful to have an extended example in one of the
> >> tests.
> >
> > It's one integer per updated entry. So if you have a giant index and
> > updated every single one of them, the EWAH bitmap contains that many
> > integers.
> >
> > If it was easy to just merge these bitmaps back to the entry (e.g. in
> > this example, add "replaced": true to entry zero) I would have done
> > it. But we dump as we stream and it's already too late to do it.
> >
> >> Would it be better to have the caller of ewah_each_bit()
> >> build a hex or bit string in a strbuf and then write it
> >> as a single string?
> >
> > I don't think the current EWAH representation is easy to read in the
> > first place. You'll probably have to run through some script to update
> > the main entries part and will have a much better view, but that's
> > pretty quick. If it's for scripts, then it's probably best to keep as
> > an array of integers, not a string. Less post processing.
>
> I don't think the intent is to dump the EWAH directly, but instead to
> dump a string of the uncompressed bitmap. Something like:
>
>         "delete_bitmap" : "01101101101"
>
> instead of
>
>         "delete_bitmap" : [ 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 ]

I get this part. But the numbers in the array were the position of the
set bits. It's not showing just the actual bit map.

The same bitmap would be currently displayed as

 "delete_bitmap": [ 1, 2, 4, 5, 7, 8, 9, 11 ]

And that maps back to the entry[1], entry[2], entry[4]... in the index
being deleted from the base index. So displaying as a real bit map
actually adds more work for both the reader and the tool because you
have to calculate the position either way. And it gets harder if the
bit you're interested in is on the far right.

> > Another reason for not merging to one string (might not be a very good
> > argument though) is to help diff between two indexes.
> > One-number-per-line works well with "git diff --no-index" while one
> > long string is a bit harder. I did this kind of comparison when I made
> > changes in read-cache.c and wanted to check if the new index file is
> > completely broken, or just slighly broken.
>
> You're right that the diff of the json output is an interesting
> use, and the "single string" output is not helpful. What about
> batches of 64-bit strings? For example:
>
>         "delete_bitmap" : [
>                 "0101010101010101010101010101010101010101010101010101010101010101",
>                 "0101010101010101010101010101010101010101010101010101010101010101",
>                 "0101010101010101010101010101010101010101010101010101010101010101",
>                 "01010101010101"
>         ]
>
> This could be a happy medium between the two options, but does require
> some extra work in the formatter.

And the reader/parser too since you have to join that array back in
one string first.
--
Duy

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-27 10:48         ` Duy Nguyen
@ 2019-06-27 13:24           ` Jeff Hostetler
  2019-06-27 13:42             ` Derrick Stolee
  0 siblings, 1 reply; 43+ messages in thread
From: Jeff Hostetler @ 2019-06-27 13:24 UTC (permalink / raw)
  To: Duy Nguyen, Derrick Stolee
  Cc: Git Mailing List, Junio C Hamano, Jeff King,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin



On 6/27/2019 6:48 AM, Duy Nguyen wrote:
> On Tue, Jun 25, 2019 at 7:40 PM Derrick Stolee <stolee@gmail.com> wrote:
>>
>> On 6/25/2019 6:29 AM, Duy Nguyen wrote:
>>> On Tue, Jun 25, 2019 at 3:06 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
>>>> I'm curious how big these EWAHs will be in practice and
>>>> how useful an array of integers will be (especially as the
>>>> pretty format will be one integer per line).  Perhaps it
>>>> would helpful to have an extended example in one of the
>>>> tests.
>>>
>>> It's one integer per updated entry. So if you have a giant index and
>>> updated every single one of them, the EWAH bitmap contains that many
>>> integers.
>>>
>>> If it was easy to just merge these bitmaps back to the entry (e.g. in
>>> this example, add "replaced": true to entry zero) I would have done
>>> it. But we dump as we stream and it's already too late to do it.
>>>
>>>> Would it be better to have the caller of ewah_each_bit()
>>>> build a hex or bit string in a strbuf and then write it
>>>> as a single string?
>>>
>>> I don't think the current EWAH representation is easy to read in the
>>> first place. You'll probably have to run through some script to update
>>> the main entries part and will have a much better view, but that's
>>> pretty quick. If it's for scripts, then it's probably best to keep as
>>> an array of integers, not a string. Less post processing.
>>
>> I don't think the intent is to dump the EWAH directly, but instead to
>> dump a string of the uncompressed bitmap. Something like:
>>
>>          "delete_bitmap" : "01101101101"
>>
>> instead of
>>
>>          "delete_bitmap" : [ 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 ]
> 
> I get this part. But the numbers in the array were the position of the
> set bits. It's not showing just the actual bit map.
> 
> The same bitmap would be currently displayed as
> 
>   "delete_bitmap": [ 1, 2, 4, 5, 7, 8, 9, 11 ]
> 
> And that maps back to the entry[1], entry[2], entry[4]... in the index
> being deleted from the base index. So displaying as a real bit map
> actually adds more work for both the reader and the tool because you
> have to calculate the position either way. And it gets harder if the
> bit you're intereted in is on the far right.


Thanks for the clarification.  That helps.

Jeff

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-27 13:24           ` Jeff Hostetler
@ 2019-06-27 13:42             ` Derrick Stolee
  2019-06-27 13:47               ` Duy Nguyen
  0 siblings, 1 reply; 43+ messages in thread
From: Derrick Stolee @ 2019-06-27 13:42 UTC (permalink / raw)
  To: Jeff Hostetler, Duy Nguyen
  Cc: Git Mailing List, Junio C Hamano, Jeff King,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On 6/27/2019 9:24 AM, Jeff Hostetler wrote:
> On 6/27/2019 6:48 AM, Duy Nguyen wrote:
>> On Tue, Jun 25, 2019 at 7:40 PM Derrick Stolee <stolee@gmail.com> wrote:
>>>
>>> On 6/25/2019 6:29 AM, Duy Nguyen wrote:
>>>> On Tue, Jun 25, 2019 at 3:06 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
>>>>> I'm curious how big these EWAHs will be in practice and
>>>>> how useful an array of integers will be (especially as the
>>>>> pretty format will be one integer per line).  Perhaps it
>>>>> would helpful to have an extended example in one of the
>>>>> tests.
>>>>
>>>> It's one integer per updated entry. So if you have a giant index and
>>>> updated every single one of them, the EWAH bitmap contains that many
>>>> integers.
>>>>
>>>> If it was easy to just merge these bitmaps back to the entry (e.g. in
>>>> this example, add "replaced": true to entry zero) I would have done
>>>> it. But we dump as we stream and it's already too late to do it.
>>>>
>>>>> Would it be better to have the caller of ewah_each_bit()
>>>>> build a hex or bit string in a strbuf and then write it
>>>>> as a single string?
>>>>
>>>> I don't think the current EWAH representation is easy to read in the
>>>> first place. You'll probably have to run through some script to update
>>>> the main entries part and will have a much better view, but that's
>>>> pretty quick. If it's for scripts, then it's probably best to keep as
>>>> an array of integers, not a string. Less post processing.
>>>
>>> I don't think the intent is to dump the EWAH directly, but instead to
>>> dump a string of the uncompressed bitmap. Something like:
>>>
>>>          "delete_bitmap" : "01101101101"
>>>
>>> instead of
>>>
>>>          "delete_bitmap" : [ 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 ]
>>
>> I get this part. But the numbers in the array were the position of the
>> set bits. It's not showing just the actual bit map.
>>
>> The same bitmap would be currently displayed as
>>
>>   "delete_bitmap": [ 1, 2, 4, 5, 7, 8, 9, 11 ]
>>
>> And that maps back to the entry[1], entry[2], entry[4]... in the index
>> being deleted from the base index. So displaying as a real bit map
>> actually adds more work for both the reader and the tool because you
>> have to calculate the position either way. And it gets harder if the
>> bit you're intereted in is on the far right.
> 
> 
> Thanks for the clarification.  That helps.

Same here! We expect these to be much smaller than the full set, correct?

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-27 13:42             ` Derrick Stolee
@ 2019-06-27 13:47               ` Duy Nguyen
  0 siblings, 0 replies; 43+ messages in thread
From: Duy Nguyen @ 2019-06-27 13:47 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Jeff Hostetler, Git Mailing List, Junio C Hamano, Jeff King,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On Thu, Jun 27, 2019 at 8:42 PM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 6/27/2019 9:24 AM, Jeff Hostetler wrote:
> > On 6/27/2019 6:48 AM, Duy Nguyen wrote:
> >> On Tue, Jun 25, 2019 at 7:40 PM Derrick Stolee <stolee@gmail.com> wrote:
> >>>
> >>> On 6/25/2019 6:29 AM, Duy Nguyen wrote:
> >>>> On Tue, Jun 25, 2019 at 3:06 AM Jeff Hostetler <git@jeffhostetler.com> wrote:
> >>>>> I'm curious how big these EWAHs will be in practice and
> >>>>> how useful an array of integers will be (especially as the
> >>>>> pretty format will be one integer per line).  Perhaps it
> >>>>> would be helpful to have an extended example in one of the
> >>>>> tests.
> >>>>
> >>>> It's one integer per updated entry. So if you have a giant index and
> >>>> updated every single one of them, the EWAH bitmap contains that many
> >>>> integers.
> >>>>
> >>>> If it was easy to just merge these bitmaps back to the entry (e.g. in
> >>>> this example, add "replaced": true to entry zero) I would have done
> >>>> it. But we dump as we stream and it's already too late to do it.
> >>>>
> >>>>> Would it be better to have the caller of ewah_each_bit()
> >>>>> build a hex or bit string in a strbuf and then write it
> >>>>> as a single string?
> >>>>
> >>>> I don't think the current EWAH representation is easy to read in the
> >>>> first place. You'll probably have to run through some script to update
> >>>> the main entries part and will have a much better view, but that's
> >>>> pretty quick. If it's for scripts, then it's probably best to keep as
> >>>> an array of integers, not a string. Less post processing.
> >>>
> >>> I don't think the intent is to dump the EWAH directly, but instead to
> >>> dump a string of the uncompressed bitmap. Something like:
> >>>
> >>>          "delete_bitmap" : "01101101101"
> >>>
> >>> instead of
> >>>
> >>>          "delete_bitmap" : [ 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 ]
> >>
> >> I get this part. But the numbers in the array were the position of the
> >> set bits. It's not showing just the actual bit map.
> >>
> >> The same bitmap would be currently displayed as
> >>
> >>   "delete_bitmap": [ 1, 2, 4, 5, 7, 8, 9, 11 ]
> >>
> >> And that maps back to the entry[1], entry[2], entry[4]... in the index
> >> being deleted from the base index. So displaying as a real bit map
> >> actually adds more work for both the reader and the tool because you
> >> have to calculate the position either way. And it gets harder if the
> >> bit you're interested in is on the far right.
> >
> >
> > Thanks for the clarification.  That helps.
>
> Same here! We expect these to be much smaller than the full set, correct?

For split-index, the number of 1 bits should be about the size of your
working set, not the index size. In the normal case, then yes it
should be much smaller. After a big merge or branch switch, it could
get as big as the index. But I would hope the logic to re-split the
index kicks in, which essentially empties these bitmaps.

EWAH bitmap is also used in UNTR extension if I remember correctly.
Those bitmaps may have as many bits as the directories you have in the
index.

> Thanks,
> -Stolee
>


-- 
Duy

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-24 13:02 ` [PATCH v2 05/10] split-index.c: dump "link" " Nguyễn Thái Ngọc Duy
  2019-06-24 20:06   ` Jeff Hostetler
@ 2019-07-03  9:08   ` SZEDER Gábor
  2019-07-04 20:01   ` SZEDER Gábor
  2 siblings, 0 replies; 43+ messages in thread
From: SZEDER Gábor @ 2019-07-03  9:08 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On Mon, Jun 24, 2019 at 08:02:21PM +0700, Nguyễn Thái Ngọc Duy wrote:
> diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
> index 082fe8e966..dbb572ce9d 100755
> --- a/t/t3011-ls-files-json.sh
> +++ b/t/t3011-ls-files-json.sh
> @@ -44,4 +44,18 @@ test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
>  	compare_json basic
>  '
>  
> +test_expect_success 'ls-files --json, split index' '
> +	git init split &&
> +	(
> +		cd split &&
> +		echo one >one &&
> +		git add one &&
> +		git update-index --split-index &&
> +		echo updated >>one &&
> +		test_must_fail git -c splitIndex.maxPercentChange=100 update-index --refresh &&
> +		cp ../filter.sed . &&
> +		compare_json split-index
> +	)
> +'

I think this test should 'sane_unset GIT_TEST_SPLIT_INDEX'.  Maybe
it's not absolutely necessary, because the explicit '--split-index'
and '-c splitIndex.maxPercentChange=100' would already fully control
when index splitting is performed, eliminating any indeterminism
inherent to GIT_TEST_SPLIT_INDEX...  but unsetting it would reduce the
cognitive load on future readers.

The same might apply to GIT_TEST_FSMONITOR in the following patch, and
perhaps even to GIT_TEST_INDEX_THREADS.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-06-24 13:02 ` [PATCH v2 05/10] split-index.c: dump "link" " Nguyễn Thái Ngọc Duy
  2019-06-24 20:06   ` Jeff Hostetler
  2019-07-03  9:08   ` SZEDER Gábor
@ 2019-07-04 20:01   ` SZEDER Gábor
  2019-07-04 23:54     ` Duy Nguyen
  2 siblings, 1 reply; 43+ messages in thread
From: SZEDER Gábor @ 2019-07-04 20:01 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On Mon, Jun 24, 2019 at 08:02:21PM +0700, Nguyễn Thái Ngọc Duy wrote:
> diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
> index 082fe8e966..dbb572ce9d 100755
> --- a/t/t3011-ls-files-json.sh
> +++ b/t/t3011-ls-files-json.sh
> @@ -44,4 +44,18 @@ test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
>  	compare_json basic
>  '
>  
> +test_expect_success 'ls-files --json, split index' '
> +	git init split &&
> +	(
> +		cd split &&
> +		echo one >one &&
> +		git add one &&
> +		git update-index --split-index &&
> +		echo updated >>one &&
> +		test_must_fail git -c splitIndex.maxPercentChange=100 update-index --refresh &&
> +		cp ../filter.sed . &&
> +		compare_json split-index
> +	)
> +'
> +
>  test_done
> diff --git a/t/t3011/split-index b/t/t3011/split-index
> new file mode 100644
> index 0000000000..cdcc4ddded
> --- /dev/null
> +++ b/t/t3011/split-index
> @@ -0,0 +1,39 @@
> +{
> +  "version": 2,
> +  "oid": <string>,
> +  "mtime_sec": <number>,
> +  "mtime_nsec": <number>,
> +  "entries": [
> +    {
> +      "id": 0,
> +      "name": "",
> +      "mode": "100644",
> +      "flags": 0,
> +      "oid": <string>,
> +      "stat": {
> +        "ctime_sec": <number>,
> +        "ctime_nsec": <number>,
> +        "mtime_sec": <number>,
> +        "mtime_nsec": <number>,
> +        "device": <number>,
> +        "inode": <number>,
> +        "uid": <number>,
> +        "gid": <number>,
> +        "size": 4
> +      },
> +      "file_offset": <number>
> +    }
> +  ],
> +  "extensions": {
> +    "link": {
> +      "file_offset": <number>,
> +      "ext_size": <number>,
> +      "oid": <string>,
> +      "delete_bitmap": [
> +      ],
> +      "replace_bitmap": [
> +        0
> +      ]
> +    }
> +  }
> +}

This test is flaky, as reported in:

  https://public-inbox.org/git/xmqqftno2mku.fsf@gitster-ct.c.googlers.com/

This is because it relies on racy behaviour, namely that the following
three commands

    echo one >one &&
    git add one &&
    git update-index --split-index &&

are executed within the same second, leaving 'one' racily clean.  To
deal with the racily clean file, 5581a019ba (split-index: smudge and
add racily clean cache entries to split index, 2018-10-11) kicks in,
and 'one's smudged index entry is stored both in the shared index and
in the split index.  That's why this test expects the offset 0 in the
"replace_bitmap" array.

However, it's possible that a second boundary is crossed between
writing to 'one' and splitting the index, and then 'one' is not racily
clean, and its index entry is only stored in the shared index.
Consequently, there are no index entries in the split index, so the
"replace_bitmap" array ends up being empty, ultimately failing the
test.

A 'test-tool chmtime' invocation or two could make the test
deterministic (i.e. it could make sure that 'one' is either always
racily clean or it never is, whichever is preferred).

What I still don't understand, however, is that when the test fails
this way, then the "entries" array ends up being empty as well.  It
looks as if the JSON dump included only index entries that were
actually stored in '.git/index', but omitted entries that were only
present in the shared index.  I think this is wrong, and it should
dump the unified view of the split and shared indexes.  Or include all
entries from the shared index as well.  Or perhaps I'm completely
missing something...

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-07-04 20:01   ` SZEDER Gábor
@ 2019-07-04 23:54     ` Duy Nguyen
  2019-07-08 17:58       ` Junio C Hamano
  0 siblings, 1 reply; 43+ messages in thread
From: Duy Nguyen @ 2019-07-04 23:54 UTC (permalink / raw)
  To: SZEDER Gábor
  Cc: Git Mailing List, Junio C Hamano, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On Fri, Jul 5, 2019 at 3:01 AM SZEDER Gábor <szeder.dev@gmail.com> wrote:
>
> On Mon, Jun 24, 2019 at 08:02:21PM +0700, Nguyễn Thái Ngọc Duy wrote:
> > diff --git a/t/t3011-ls-files-json.sh b/t/t3011-ls-files-json.sh
> > index 082fe8e966..dbb572ce9d 100755
> > --- a/t/t3011-ls-files-json.sh
> > +++ b/t/t3011-ls-files-json.sh
> > @@ -44,4 +44,18 @@ test_expect_success 'ls-files --json, main entries, UNTR and TREE' '
> >       compare_json basic
> >  '
> >
> > +test_expect_success 'ls-files --json, split index' '
> > +     git init split &&
> > +     (
> > +             cd split &&
> > +             echo one >one &&
> > +             git add one &&
> > +             git update-index --split-index &&
> > +             echo updated >>one &&
> > +             test_must_fail git -c splitIndex.maxPercentChange=100 update-index --refresh &&
> > +             cp ../filter.sed . &&
> > +             compare_json split-index
> > +     )
> > +'
> > +
> >  test_done
> > diff --git a/t/t3011/split-index b/t/t3011/split-index
> > new file mode 100644
> > index 0000000000..cdcc4ddded
> > --- /dev/null
> > +++ b/t/t3011/split-index
> > @@ -0,0 +1,39 @@
> > +{
> > +  "version": 2,
> > +  "oid": <string>,
> > +  "mtime_sec": <number>,
> > +  "mtime_nsec": <number>,
> > +  "entries": [
> > +    {
> > +      "id": 0,
> > +      "name": "",
> > +      "mode": "100644",
> > +      "flags": 0,
> > +      "oid": <string>,
> > +      "stat": {
> > +        "ctime_sec": <number>,
> > +        "ctime_nsec": <number>,
> > +        "mtime_sec": <number>,
> > +        "mtime_nsec": <number>,
> > +        "device": <number>,
> > +        "inode": <number>,
> > +        "uid": <number>,
> > +        "gid": <number>,
> > +        "size": 4
> > +      },
> > +      "file_offset": <number>
> > +    }
> > +  ],
> > +  "extensions": {
> > +    "link": {
> > +      "file_offset": <number>,
> > +      "ext_size": <number>,
> > +      "oid": <string>,
> > +      "delete_bitmap": [
> > +      ],
> > +      "replace_bitmap": [
> > +        0
> > +      ]
> > +    }
> > +  }
> > +}
>
> This test is flaky, as reported in:
>
>   https://public-inbox.org/git/xmqqftno2mku.fsf@gitster-ct.c.googlers.com/
>
> This is because it relies on racy behaviour, namely that the following
> three commands
>
>     echo one >one &&
>     git add one &&
>     git update-index --split-index &&
>
> are executed within the same second, leaving 'one' racily clean.  To
> deal with the racily clean file, 5581a019ba (split-index: smudge and
> add racily clean cache entries to split index, 2018-10-11) kicks in,
> and 'one's smudged index entry is stored both in the shared index and
> in the split index.  That's why this test expects the offset 0 in the
> "replace_bitmap" array.
>
> However, it's possible that a second boundary is crossed between
> writing to 'one' and splitting the index, and then 'one' is not racily
> clean, and its index entry is only stored in the shared index.
> Consequently, there are no index entries in the split index, so the
> "replace_bitmap" array ends up being empty, ultimately failing the
> test.

Yep. I came to the same conclusion. But I still have a couple
other things to update before resending.

> A 'test-tool chmtime' invocation or two could make the test
> deterministic (i.e. it could make sure that 'one' is either always
> racily clean or it never is, whichever is preferred).
>
> What I still don't understand, however, is that when the test fails
> this way, then the "entries" array ends up being empty as well.  It
> looks as if the JSON dump included only index entries that were
> actually stored in '.git/index', but omitted entries that were only
> present in the shared index.  I think this is wrong, and it should
> dump the unified view of the split and shared indexes.  Or include all
> entries from the shared index as well.  Or perhaps I'm completely
> missing something...

The command is to dump .git/index, not the shared one. And since this
is not a split index test, rather a (quite low-level) json dump test,
I did not bother to also dump the shared index, which should look like
a regular one. Producing a unified view in json might not be easy with
the current code because it's tied to the file reading code, which nearly
streams out json as we read the file, and split-index requires a
post-processing step. I could contribute a python script or something to
combine shared/main index together. That way you can still see the
combined one, but we don't have to add/maintain more C code.
-- 
Duy

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 05/10] split-index.c: dump "link" extension as json
  2019-07-04 23:54     ` Duy Nguyen
@ 2019-07-08 17:58       ` Junio C Hamano
  0 siblings, 0 replies; 43+ messages in thread
From: Junio C Hamano @ 2019-07-08 17:58 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: SZEDER Gábor, Git Mailing List, Jeff King, Derrick Stolee,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

Duy Nguyen <pclouds@gmail.com> writes:

> The command is to dump .git/index, not the shared one. And since this
> is not a split index test, rather a (quite low-level) json dump test,
> I did not bother to also dump the shared index, which should look like
> a regular one. Producing a unified view in json might not be easy with
> the current code because it's tied to the file reading code, which nearly
> streams out json as we read the file, and split-index requires a
> post-processing step. I could contribute a python script or something to
> combine shared/main index together. That way you can still see the
> combined one, but we don't have to add/maintain more C code.

Well, such a post-processing is something external scripts shine at
and exporting the internal data in json format is exactly to support
these scripts, so it may make a good first test case ;-)

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2019-07-08 17:58 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-24 13:02 [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Nguyễn Thái Ngọc Duy
2019-06-24 13:02 ` [PATCH v2 01/10] ls-files: add --json to dump the index Nguyễn Thái Ngọc Duy
2019-06-24 19:15   ` Jeff Hostetler
2019-06-24 20:04     ` Junio C Hamano
2019-06-25  9:21     ` Johannes Schindelin
2019-06-25  9:52     ` Duy Nguyen
2019-06-25 15:37       ` Jeff Hostetler
2019-06-25  9:05   ` Thomas Gummerer
2019-06-25  9:44   ` Johannes Schindelin
2019-06-25 11:31     ` Johannes Schindelin
2019-06-25 13:57       ` Johannes Schindelin
2019-06-25 22:28     ` Junio C Hamano
2019-06-26 19:51   ` Junio C Hamano
2019-06-24 13:02 ` [PATCH v2 02/10] read-cache.c: dump common extension info in json Nguyễn Thái Ngọc Duy
2019-06-24 13:02 ` [PATCH v2 03/10] cache-tree.c: dump "TREE" extension as json Nguyễn Thái Ngọc Duy
2019-06-24 13:02 ` [PATCH v2 04/10] dir.c: dump "UNTR" " Nguyễn Thái Ngọc Duy
2019-06-24 19:32   ` Jeff Hostetler
2019-06-24 13:02 ` [PATCH v2 05/10] split-index.c: dump "link" " Nguyễn Thái Ngọc Duy
2019-06-24 20:06   ` Jeff Hostetler
2019-06-25 10:29     ` Duy Nguyen
2019-06-25 12:40       ` Derrick Stolee
2019-06-27 10:48         ` Duy Nguyen
2019-06-27 13:24           ` Jeff Hostetler
2019-06-27 13:42             ` Derrick Stolee
2019-06-27 13:47               ` Duy Nguyen
2019-07-03  9:08   ` SZEDER Gábor
2019-07-04 20:01   ` SZEDER Gábor
2019-07-04 23:54     ` Duy Nguyen
2019-07-08 17:58       ` Junio C Hamano
2019-06-24 13:02 ` [PATCH v2 06/10] fsmonitor.c: dump "FSMN" " Nguyễn Thái Ngọc Duy
2019-06-24 13:02 ` [PATCH v2 07/10] resolve-undo.c: dump "REUC" " Nguyễn Thái Ngọc Duy
2019-06-24 13:02 ` [PATCH v2 08/10] read-cache.c: dump "EOIE" " Nguyễn Thái Ngọc Duy
2019-06-24 13:02 ` [PATCH v2 09/10] read-cache.c: dump "IEOT" " Nguyễn Thái Ngọc Duy
2019-06-24 13:02 ` [PATCH v2 10/10] t3008: use the new SINGLE_CPU prereq Nguyễn Thái Ngọc Duy
2019-06-24 18:00 ` [PATCH v2 00/10] Add 'ls-files --debug-json' to dump the index in json Johannes Schindelin
2019-06-24 18:39   ` Jeff Hostetler
2019-06-25  9:05   ` Duy Nguyen
2019-06-25  9:38     ` Thomas Gummerer
2019-06-25 11:27     ` Johannes Schindelin
2019-06-25 12:06       ` Duy Nguyen
2019-06-25 14:10         ` Johannes Schindelin
2019-06-25 17:08           ` Ramsay Jones
2019-06-26 15:05             ` Johannes Schindelin

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).