git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects
@ 2017-09-22 20:30 Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
                   ` (7 more replies)
  0 siblings, 8 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Create a filter for traverse_commit_list_worker() to omit blobs
larger than a requested size from the result, but always include
".git*" special files.

Signed-off-by: Jeff Hostetler <jeffhost>@microsoft.com>
---
 Makefile                    |   1 +
 list-objects-filter-large.c | 108 ++++++++++++++++++++++++++++++++++++++++++++
 list-objects-filter-large.h |  18 ++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 list-objects-filter-large.c
 create mode 100644 list-objects-filter-large.h

diff --git a/Makefile b/Makefile
index b98e3dc..f1f3979 100644
--- a/Makefile
+++ b/Makefile
@@ -799,6 +799,7 @@ LIB_OBJS += line-log.o
 LIB_OBJS += line-range.o
 LIB_OBJS += list-objects.o
 LIB_OBJS += list-objects-filter-all.o
+LIB_OBJS += list-objects-filter-large.o
 LIB_OBJS += ll-merge.o
 LIB_OBJS += lockfile.o
 LIB_OBJS += log-tree.o
diff --git a/list-objects-filter-large.c b/list-objects-filter-large.c
new file mode 100644
index 0000000..1af39b6
--- /dev/null
+++ b/list-objects-filter-large.c
@@ -0,0 +1,108 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-large.h"
+
+/*
+ * A filter for list-objects to omit large blobs,
+ * but always include ".git*" special files.
+ */
+struct filter_omit_large_blobs_data {
+	struct oidset2 omits;
+	int64_t max_bytes;
+};
+
+static list_objects_filter_result filter_omit_large_blobs(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_omit_large_blobs_data *filter_data = filter_data_;
+	int64_t object_length = -1;
+	unsigned long s;
+	enum object_type t;
+
+	switch (filter_type) {
+	default:
+		die("unkown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		/* always include all tree objects */
+		return LOFR_MARK_SEEN | LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		/*
+		 * If previously provisionally omitted (because of size), see if the
+		 * current filename is special and force it to be included.
+		 */
+		if (oidset2_contains(&filter_data->omits, &obj->oid)) {
+			if ((strncmp(filename, ".git", 4) == 0) && filename[4]) {
+				oidset2_remove(&filter_data->omits, &obj->oid);
+				return LOFR_MARK_SEEN | LOFR_SHOW;
+			}
+			return LOFR_ZERO; /* continue provisionally omitting it */
+		}
+
+		t = sha1_object_info(obj->oid.hash, &s);
+		assert(t == OBJ_BLOB);
+		object_length = (int64_t)((uint64_t)(s));
+
+		if (object_length < filter_data->max_bytes)
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+
+		/*
+		 * Provisionally omit it.  We've already established that this blob
+		 * is too big and doesn't have a special filename, so we WANT to
+		 * omit it.  However, there may be a special file elsewhere in the
+		 * tree that references this same blob, so we cannot reject it yet.
+		 * Leave the LOFR_ bits unset so that if the blob appears again in
+		 * the traversal, we will be asked again.
+		 *
+		 * No need for a pathname, since we only test for special filenames
+		 * above.
+		 */
+		oidset2_insert(&filter_data->omits, &obj->oid, t, object_length,
+			       NULL);
+		return LOFR_ZERO;
+	}
+}
+
+void traverse_commit_list_omit_large_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	int64_t large_byte_limit)
+{
+	struct filter_omit_large_blobs_data d;
+
+	memset(&d, 0, sizeof(d));
+	d.max_bytes = large_byte_limit;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_omit_large_blobs, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
diff --git a/list-objects-filter-large.h b/list-objects-filter-large.h
new file mode 100644
index 0000000..4a5c772
--- /dev/null
+++ b/list-objects-filter-large.h
@@ -0,0 +1,18 @@
+#ifndef LIST_OBJECTS_FILTER_LARGE_H
+#define LIST_OBJECTS_FILTER_LARGE_H
+
+#include "oidset2.h"
+
+/*
+ * A filter for list-objects to omit large blobs,
+ * but always include ".git*" special files.
+ */
+void traverse_commit_list_omit_large_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	int64_t large_byte_limit);
+
+#endif /* LIST_OBJECTS_FILTER_LARGE_H */
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Create a filter for traverse_commit_list_worker() to only include
the blobs the would be referenced by a sparse-checkout using the
given specification.

A future enhancement should be able to also omit unneeded tree
objects, but that is not currently supported.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile                     |   1 +
 list-objects-filter-sparse.c | 221 +++++++++++++++++++++++++++++++++++++++++++
 list-objects-filter-sparse.h |  30 ++++++
 3 files changed, 252 insertions(+)
 create mode 100644 list-objects-filter-sparse.c
 create mode 100644 list-objects-filter-sparse.h

diff --git a/Makefile b/Makefile
index f1f3979..6e0bd91 100644
--- a/Makefile
+++ b/Makefile
@@ -800,6 +800,7 @@ LIB_OBJS += line-range.o
 LIB_OBJS += list-objects.o
 LIB_OBJS += list-objects-filter-all.o
 LIB_OBJS += list-objects-filter-large.o
+LIB_OBJS += list-objects-filter-sparse.o
 LIB_OBJS += ll-merge.o
 LIB_OBJS += lockfile.o
 LIB_OBJS += log-tree.o
diff --git a/list-objects-filter-sparse.c b/list-objects-filter-sparse.c
new file mode 100644
index 0000000..59155d5
--- /dev/null
+++ b/list-objects-filter-sparse.c
@@ -0,0 +1,221 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-sparse.h"
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID or from a local pathname.  We allow an OID because
+ * the repo may be bare or we may be doing the filtering on the
+ * server.
+ */
+struct frame {
+	int defval;
+	int child_prov_omit : 1;
+};
+
+struct filter_use_sparse_data {
+	struct oidset2 omits;
+	struct exclude_list el;
+
+	size_t nr, alloc;
+	struct frame *array_frame;
+};
+
+static list_objects_filter_result filter_use_sparse(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_use_sparse_data *filter_data = filter_data_;
+	int64_t object_length = -1;
+	int val, dtype;
+	unsigned long s;
+	enum object_type t;
+	struct frame *frame;
+
+	switch (filter_type) {
+	default:
+		die("unkown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		dtype = DT_DIR;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = filter_data->array_frame[filter_data->nr].defval;
+
+		ALLOC_GROW(filter_data->array_frame, filter_data->nr + 1,
+			   filter_data->alloc);
+		filter_data->nr++;
+		filter_data->array_frame[filter_data->nr].defval = val;
+		filter_data->array_frame[filter_data->nr].child_prov_omit = 0;
+
+		/*
+		 * A directory with this tree OID may appear in multiple
+		 * places in the tree. (Think of a directory move, with
+		 * no other changes.)  And with a different pathname, the
+		 * is_excluded...() results for this directory and items
+		 * contained within it may be different.  So we cannot
+		 * mark it SEEN (yet), since that will prevent process_tree()
+		 * from revisiting this tree object with other pathnames.
+		 *
+		 * Only SHOW the tree object the first time we visit this
+		 * tree object.
+		 *
+		 * We always show all tree objects.  A future optimization
+		 * may want to attempt to narrow this.
+		 */
+		if (obj->flags & FILTER_REVISIT)
+			return LOFR_ZERO;
+		obj->flags |= FILTER_REVISIT;
+		return LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		assert(filter_data->nr > 0);
+
+		frame = &filter_data->array_frame[filter_data->nr];
+		filter_data->nr--;
+
+		/*
+		 * Tell our parent directory if any of our children were
+		 * provisionally omitted.
+		 */
+		filter_data->array_frame[filter_data->nr].child_prov_omit |=
+			frame->child_prov_omit;
+
+		/*
+		 * If there are NO provisionally omitted child objects (ALL child
+		 * objects in this folder were INCLUDED), then we can mark the
+		 * folder as SEEN (so we will not have to revisit it again).
+		 */
+		if (!frame->child_prov_omit)
+			return LOFR_MARK_SEEN;
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		frame = &filter_data->array_frame[filter_data->nr];
+
+		/*
+		 * If we previously provisionally omitted this blob because
+		 * its pathname was not in the sparse-checkout AND this
+		 * reference to the blob has the same pathname, we can avoid
+		 * repeating the exclusion logic on this pathname and just
+		 * continue to provisionally omit it.
+		 */
+		if (obj->flags & FILTER_REVISIT) {
+			struct oidset2_entry *entry_prev;
+			entry_prev = oidset2_get(&filter_data->omits, &obj->oid);
+			if (entry_prev && !strcmp(pathname, entry_prev->pathname)) {
+				frame->child_prov_omit = 1;
+				return LOFR_ZERO;
+			}
+		}
+
+		dtype = DT_REG;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = frame->defval;
+		if (val > 0)
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+
+		t = sha1_object_info(obj->oid.hash, &s);
+		assert(t == OBJ_BLOB);
+		object_length = (int64_t)((uint64_t)(s));
+
+		/*
+		 * Provisionally omit it.  We've already established that
+		 * this pathname is not in the sparse-checkout specification,
+		 * so we WANT to omit this blob.  However, a pathname elsewhere
+		 * in the tree may also reference this same blob, so we cannot
+		 * reject it yet.  Leave the LOFR_ bits unset so that if the
+		 * blob appears again in the traversal, we will be asked again.
+		 *
+		 * The pathname we associate with this omit is just the first
+		 * one we saw for this blob.  Other instances of this blob may
+		 * have other pathnames and that is fine.  We just use it for
+		 * perf because most of the time, the blob will be in the same
+		 * place as we walk the commits.
+		 */
+		oidset2_insert(&filter_data->omits, &obj->oid, t, object_length,
+			       pathname);
+		obj->flags |= FILTER_REVISIT;
+		frame->child_prov_omit = 1;
+		return LOFR_ZERO;
+	}
+}
+
+void traverse_commit_list_use_blob(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	struct object_id *oid)
+{
+	struct filter_use_sparse_data d;
+
+	memset(&d, 0, sizeof(d));
+	if (add_excludes_from_blob_to_list(oid, NULL, 0, &d.el) < 0)
+		die("could not load filter specification");
+
+	ALLOC_GROW(d.array_frame, d.nr + 1, d.alloc);
+	d.array_frame[d.nr].defval = 0; /* default to include */
+	d.array_frame[d.nr].child_prov_omit = 0;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_use_sparse, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
+
+void traverse_commit_list_use_path(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	const char *path)
+{
+	struct filter_use_sparse_data d;
+
+	memset(&d, 0, sizeof(d));
+	if (add_excludes_from_file_to_list(path, NULL, 0, &d.el, NULL) < 0)
+		die("could not load filter specification");
+
+	ALLOC_GROW(d.array_frame, d.nr + 1, d.alloc);
+	d.array_frame[d.nr].defval = 0; /* default to include */
+	d.array_frame[d.nr].child_prov_omit = 0;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_use_sparse, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
diff --git a/list-objects-filter-sparse.h b/list-objects-filter-sparse.h
new file mode 100644
index 0000000..aa89390
--- /dev/null
+++ b/list-objects-filter-sparse.h
@@ -0,0 +1,30 @@
+#ifndef LIST_OBJECTS_FILTERS_SPARSE_H
+#define LIST_OBJECTS_FILTERS_SPARSE_H
+
+#include "oidset2.h"
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID, a blob with a blob-ish path, or from a local pathname.
+ * We allow an OID because the repo may be bare or we may be doing
+ * the filtering on the server.
+ */
+void traverse_commit_list_use_blob(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	struct object_id *oid);
+void traverse_commit_list_use_path(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	const char *path);
+
+#endif /* LIST_OBJECTS_FILTERS_SPARSE_H */
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-26 22:39   ` Jonathan Tan
  2017-09-22 20:30 ` [PATCH 08/13] list-objects: add traverse_commit_list_filtered method Jeff Hostetler
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Create common routines and defines for parsing
object-filter-related command line arguments and
pack-protocol fields.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile        |   1 +
 object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
 3 files changed, 443 insertions(+)
 create mode 100644 object-filter.c
 create mode 100644 object-filter.h

diff --git a/Makefile b/Makefile
index 6e0bd91..bddd6b5 100644
--- a/Makefile
+++ b/Makefile
@@ -818,6 +818,7 @@ LIB_OBJS += notes-cache.o
 LIB_OBJS += notes-merge.o
 LIB_OBJS += notes-utils.o
 LIB_OBJS += object.o
+LIB_OBJS += object-filter.o
 LIB_OBJS += oidset.o
 LIB_OBJS += oidset2.o
 LIB_OBJS += pack-bitmap.o
diff --git a/object-filter.c b/object-filter.c
new file mode 100644
index 0000000..c88f79f
--- /dev/null
+++ b/object-filter.c
@@ -0,0 +1,269 @@
+#include "cache.h"
+#include "commit.h"
+#include "config.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "oidset2.h"
+#include "list-objects-filter-all.h"
+#include "list-objects-filter-large.h"
+#include "list-objects-filter-sparse.h"
+#include "object-filter.h"
+
+int parse_filter_omit_all_blobs(struct object_filter_options *filter_options)
+{
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->omit_all_blobs = 1;
+	return 0;
+}
+
+int parse_filter_omit_large_blobs(struct object_filter_options *filter_options,
+				  const char *arg)
+{
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->omit_large_blobs = 1;
+
+	/* we allow "<digits>[kmg]" */
+	if (!git_parse_ulong(arg, &filter_options->large_byte_limit))
+		die(_("invalid size limit for large object filter"));
+
+	filter_options->large_byte_limit_string = strdup(arg);
+	return 0;
+}
+
+int parse_filter_use_blob(struct object_filter_options *filter_options,
+			  const char *arg)
+{
+	struct object_context oc;
+
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->use_blob = 1;
+
+	/*
+	 * The command line argument needs to resolve to an known OID
+	 * representing the content of the desired sparse-checkout file.
+	 *
+	 * We allow various syntax forms for the convenience of the user.
+	 * See sha1_name.c:get_sha1_with_context_1().
+	 *
+	 * Try to evaluate the arg locally in case they use one of the
+	 * convenience patterns.  This must resolve to a blob.
+	 */
+	if (get_sha1_with_context(arg, GET_SHA1_BLOB,
+				  filter_options->sparse_oid.hash, &oc)) {
+		/*
+		 * If that fails, keep the original string in case a client
+		 * command wants to send it to the server.  This allows the
+		 * client to name an OID for a blob they don't have.
+		 */
+		filter_options->sparse_value = strdup(arg);
+		oidcpy(&filter_options->sparse_oid, &null_oid);
+	} else {
+		/*
+		 * Round-trip the found OID to normalize it.
+		 */
+		filter_options->sparse_value =
+			strdup(oid_to_hex(&filter_options->sparse_oid));
+	}
+	return 0;
+}
+
+int parse_filter_use_path(struct object_filter_options *filter_options,
+			  const char *arg)
+{
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->use_path = 1;
+
+	/*
+	 * The command line argument needs to resolve to a local file
+	 * containing the content of the desired sparse-checkout file.
+	 */
+	filter_options->sparse_value = strdup(arg);
+	return 0;
+}
+
+int parse_filter_print_omitted(struct object_filter_options *filter_options)
+{
+	filter_options->print_omitted = 1;
+	return 0;
+}
+
+int parse_filter_print_missing(struct object_filter_options *filter_options)
+{
+	filter_options->print_missing = 1;
+	return 0;
+}
+
+int parse_filter_relax(struct object_filter_options *filter_options)
+{
+	filter_options->relax = 1;
+	return 0;
+}
+
+int opt_parse_filter_omit_all_blobs(const struct option *opt,
+				    const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_omit_all_blobs(filter_options);
+}
+
+int opt_parse_filter_omit_large_blobs(const struct option *opt,
+				      const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(arg);
+	assert(!unset);
+
+	return parse_filter_omit_large_blobs(filter_options, arg);
+}
+
+int opt_parse_filter_use_blob(const struct option *opt,
+			      const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(arg);
+	assert(!unset);
+
+	return parse_filter_use_blob(filter_options, arg);
+}
+
+int opt_parse_filter_use_path(const struct option *opt,
+			      const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(arg);
+	assert(!unset);
+
+	return parse_filter_use_path(filter_options, arg);
+}
+
+int opt_parse_filter_print_omitted(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_print_omitted(filter_options);
+}
+
+int opt_parse_filter_print_missing(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_print_missing(filter_options);
+}
+
+int opt_parse_filter_relax(const struct option *opt,
+			   const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_relax(filter_options);
+}
+
+int object_filter_hand_parse_arg(struct object_filter_options *filter_options,
+				 const char *arg,
+				 int allow_print_omitted,
+				 int allow_print_missing,
+				 int allow_relax)
+{
+	if (!strcmp(arg, ("--"CL_ARG_FILTER_OMIT_ALL_BLOBS))) {
+		parse_filter_omit_all_blobs(filter_options);
+		return 1;
+	}
+	if (skip_prefix(arg, ("--"CL_ARG_FILTER_OMIT_LARGE_BLOBS"="), &arg)) {
+		parse_filter_omit_large_blobs(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, ("--"CL_ARG_FILTER_USE_BLOB"="), &arg)) {
+		parse_filter_use_blob(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, ("--"CL_ARG_FILTER_USE_PATH"="), &arg)) {
+		parse_filter_use_path(filter_options, arg);
+		return 1;
+	}
+
+	if (allow_print_omitted &&
+	    !strcmp(arg, ("--"CL_ARG_FILTER_PRINT_OMITTED))) {
+		parse_filter_print_omitted(filter_options);
+		return 1;
+	}
+
+	if (allow_print_missing &&
+	    !strcmp(arg, ("--"CL_ARG_FILTER_PRINT_MISSING))) {
+		parse_filter_print_missing(filter_options);
+		return 1;
+	}
+
+	if (allow_relax && !strcmp(arg, ("--"CL_ARG_FILTER_RELAX))) {
+		parse_filter_relax(filter_options);
+		return 1;
+	}
+
+	return 0;
+}
+
+int object_filter_hand_parse_protocol(struct object_filter_options *filter_options,
+				      const char *arg,
+				      int allow_print_omitted,
+				      int allow_print_missing,
+				      int allow_relax)
+{
+	if (!strcmp(arg, CL_ARG_FILTER_OMIT_ALL_BLOBS)) {
+		parse_filter_omit_all_blobs(filter_options);
+		return 1;
+	}
+	if (skip_prefix(arg, (CL_ARG_FILTER_OMIT_LARGE_BLOBS" "), &arg)) {
+		parse_filter_omit_large_blobs(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, (CL_ARG_FILTER_USE_BLOB" "), &arg)) {
+		parse_filter_use_blob(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, (CL_ARG_FILTER_USE_PATH" "), &arg)) {
+		parse_filter_use_path(filter_options, arg);
+		return 1;
+	}
+
+	if (allow_print_omitted &&
+	    !strcmp(arg, CL_ARG_FILTER_PRINT_OMITTED)) {
+		parse_filter_print_omitted(filter_options);
+		return 1;
+	}
+	if (allow_print_missing &&
+	    !strcmp(arg, CL_ARG_FILTER_PRINT_MISSING)) {
+		parse_filter_print_missing(filter_options);
+		return 1;
+	}
+	if (allow_relax && !strcmp(arg, CL_ARG_FILTER_RELAX)) {
+		parse_filter_relax(filter_options);
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/object-filter.h b/object-filter.h
new file mode 100644
index 0000000..13a638b
--- /dev/null
+++ b/object-filter.h
@@ -0,0 +1,173 @@
+#ifndef OBJECT_FILTER_H
+#define OBJECT_FILTER_H
+
+#include "parse-options.h"
+
+/*
+ * Common declarations and utilities for filtering objects (such as omitting
+ * large blobs) in list_objects:traverse_commit_list() and git-rev-list.
+ */
+
+struct object_filter_options {
+	/*
+	 * File pathname or blob-ish path/OID (that get_sha1_with_context() can
+	 * use to find the blob containing the sparse-checkout specification.
+	 * This is only used when use_blob or use_path is set.
+	 */
+	const char *sparse_value;
+	struct object_id sparse_oid;
+
+	/*
+	 * Blob size byte limit for filtering.  Only blobs smaller than this
+	 * value will be included.  A value of zero, omits all blobs.
+	 * only used when omit_large_blobs is set.  Integer and string versions
+	 * of this are kept for convenience.  The string version may contain
+	 * a [kmg] suffix.
+	 */
+	unsigned long large_byte_limit;
+	const char *large_byte_limit_string;
+
+	/* Valid filter types (only one may be used at a time) */
+	unsigned omit_all_blobs : 1;
+	unsigned omit_large_blobs : 1;
+	unsigned use_blob : 1;
+	unsigned use_path : 1;
+
+	/*
+	 * True if rev-list should print a list of the objects omitted
+	 * by this invocation of a filter.
+	 */
+	unsigned print_omitted : 1;
+
+	/*
+	 * True if rev-list should print a list of missing objects.
+	 * Objects can be missing because of a previously filtered
+	 * clone or fetch. The set reported here can also be filtered
+	 * by the current filter in effect.
+	 */
+	unsigned print_missing : 1;
+
+	/* True to suppress missing object errors during consistency checks */
+	unsigned relax : 1;
+};
+
+/*
+ * Return true if a filter is enabled.
+ */
+inline int object_filter_enabled(const struct object_filter_options *p)
+{
+	return p->omit_all_blobs ||
+		p->omit_large_blobs ||
+		p->use_blob ||
+		p->use_path;
+}
+
+/* Normalized command line arguments */
+#define CL_ARG_FILTER_OMIT_ALL_BLOBS     "filter-omit-all-blobs"
+#define CL_ARG_FILTER_OMIT_LARGE_BLOBS   "filter-omit-large-blobs"
+#define CL_ARG_FILTER_USE_BLOB           "filter-use-blob"
+#define CL_ARG_FILTER_USE_PATH           "filter-use-path"
+#define CL_ARG_FILTER_PRINT_OMITTED      "filter-print-omitted"
+#define CL_ARG_FILTER_PRINT_MISSING      "filter-print-missing"
+#define CL_ARG_FILTER_RELAX              "filter-relax"
+
+/*
+ * Common command line argument parsing for object-filter-related
+ * arguments (whether from a hand-parsed or parse-options style
+ * parser.
+ */
+int parse_filter_omit_all_blobs(struct object_filter_options *filter_options);
+int parse_filter_omit_large_blobs(struct object_filter_options *filter_options,
+				  const char *arg);
+int parse_filter_use_blob(struct object_filter_options *filter_options,
+			  const char *arg);
+int parse_filter_use_path(struct object_filter_options *filter_options,
+			  const char *arg);
+int parse_filter_print_omitted(struct object_filter_options *filter_options);
+int parse_filter_print_missing(struct object_filter_options *filter_options);
+int parse_filter_relax(struct object_filter_options *filter_options);
+
+/*
+ * Common command line argument parsers for object-filter-related
+ * arguments comming from parse-options style parsers.
+ */
+
+int opt_parse_filter_omit_all_blobs(const struct option *opt,
+				    const char *arg, int unset);
+int opt_parse_filter_omit_large_blobs(const struct option *opt,
+				      const char *arg, int unset);
+int opt_parse_filter_use_blob(const struct option *opt,
+			      const char *arg, int unset);
+int opt_parse_filter_use_path(const struct option *opt,
+			      const char *arg, int unset);
+int opt_parse_filter_print_omitted(const struct option *opt,
+				   const char *arg, int unset);
+int opt_parse_filter_print_missing(const struct option *opt,
+				   const char *arg, int unset);
+int opt_parse_filter_relax(const struct option *opt,
+			   const char *arg, int unset);
+
+#define OPT_PARSE_FILTER_OMIT_ALL_BLOBS(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_OMIT_ALL_BLOBS, fo, NULL, \
+	  N_("omit all blobs from result"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \
+	  opt_parse_filter_omit_all_blobs }
+
+#define OPT_PARSE_FILTER_OMIT_LARGE_BLOBS(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_OMIT_LARGE_BLOBS, fo, N_("size"), \
+	  N_("omit large blobs from result"), PARSE_OPT_NONEG, \
+	  opt_parse_filter_omit_large_blobs }
+
+#define OPT_PARSE_FILTER_USE_BLOB(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_USE_BLOB, fo, N_("object"), \
+	  N_("filter results using sparse-checkout specification"), PARSE_OPT_NONEG, \
+	  opt_parse_filter_use_blob }
+
+#define OPT_PARSE_FILTER_USE_PATH(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_USE_PATH, fo, N_("path"), \
+	  N_("filter results using sparse-checkout specification"), PARSE_OPT_NONEG, \
+	  opt_parse_filter_use_path }
+
+#define OPT_PARSE_FILTER_PRINT_OMITTED(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_PRINT_OMITTED, fo, NULL,	\
+	  N_("print list of omitted objects"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \
+	  opt_parse_filter_print_omitted }
+
+#define OPT_PARSE_FILTER_PRINT_MISSING(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_PRINT_MISSING, fo, NULL,	\
+	  N_("print list of missing objects"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \
+	  opt_parse_filter_print_missing }
+
+#define OPT_PARSE_FILTER_RELAX(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_RELAX, fo, NULL, \
+	  N_("relax consistency checks for previously omitted objects"), \
+	  PARSE_OPT_NOARG | PARSE_OPT_NONEG, opt_parse_filter_relax }
+
+/*
+ * Hand parse known object-filter command line options.
+ * Use this when the caller DOES NOT use the normal OPT_
+ * routines.
+ *
+ * Here we assume args of the form "--<key>" or "--<key>=<value>".
+ * Note the literal dash-dash and equals.
+ *
+ * Returns 1 if we handled the argument.
+ */
+int object_filter_hand_parse_arg(struct object_filter_options *filter_options,
+				 const char *arg,
+				 int allow_print_omitted,
+				 int allow_print_missing,
+				 int allow_relax);
+
+/*
+ * Hand parse known object-filter protocol lines.
+ *
+ * Here we assume args of the form "<key>" or "<key> <value>".
+ * Note the literal space before between the key and value.
+ */ 
+int object_filter_hand_parse_protocol(struct object_filter_options *filter_options,
+				      const char *arg,
+				      int allow_print_omitted,
+				      int allow_print_missing,
+				      int allow_relax);
+
+#endif /* OBJECT_FILTER_H */
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 08/13] list-objects: add traverse_commit_list_filtered method
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Add traverse_commit_list_filtered() wrapper around the various
filter methods using common data in object_filter_options.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 list-objects.c | 34 ++++++++++++++++++++++++++++++++++
 list-objects.h | 11 +++++++++++
 2 files changed, 45 insertions(+)

diff --git a/list-objects.c b/list-objects.c
index 3e86008..0f063d9 100644
--- a/list-objects.c
+++ b/list-objects.c
@@ -7,6 +7,9 @@
 #include "tree-walk.h"
 #include "revision.h"
 #include "list-objects.h"
+#include "list-objects-filter-all.h"
+#include "list-objects-filter-large.h"
+#include "list-objects-filter-sparse.h"
 
 static void process_blob(struct rev_info *revs,
 			 struct blob *blob,
@@ -266,3 +269,34 @@ void traverse_commit_list(struct rev_info *revs,
 		show_commit, show_object, show_data,
 		NULL, NULL);
 }
+
+void traverse_commit_list_filtered(
+	struct object_filter_options *filter_options,
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *show_data)
+{
+	if (filter_options->omit_all_blobs)
+		traverse_commit_list_omit_all_blobs(
+			revs, show_commit, show_object, print_omitted_object, show_data);
+
+	else if (filter_options->omit_large_blobs)
+		traverse_commit_list_omit_large_blobs(
+			revs, show_commit, show_object, print_omitted_object, show_data,
+			(int64_t)(uint64_t)filter_options->large_byte_limit);
+
+	else if (filter_options->use_blob)
+		traverse_commit_list_use_blob(
+			revs, show_commit, show_object, print_omitted_object, show_data,
+			&filter_options->sparse_oid);
+
+	else if (filter_options->use_path)
+		traverse_commit_list_use_path(
+			revs, show_commit, show_object, print_omitted_object, show_data,
+			filter_options->sparse_value);
+
+	else
+		die("unspecified list-objects filter");
+}
diff --git a/list-objects.h b/list-objects.h
index 39fcbb5..a8acedc 100644
--- a/list-objects.h
+++ b/list-objects.h
@@ -1,6 +1,9 @@
 #ifndef LIST_OBJECTS_H
 #define LIST_OBJECTS_H
 
+#include "oidset2.h"
+#include "object-filter.h"
+
 typedef void (*show_commit_fn)(struct commit *, void *);
 typedef void (*show_object_fn)(struct object *, const char *, void *);
 void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *);
@@ -38,4 +41,12 @@ void traverse_commit_list_worker(
 	show_commit_fn, show_object_fn, void *show_data,
 	filter_object_fn filter, void *filter_data);
 
+void traverse_commit_list_filtered(
+	struct object_filter_options *filter_options,
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *show_data);
+
 #endif
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 09/13] rev-list: add object filtering support
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (2 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 08/13] list-objects: add traverse_commit_list_filtered method Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-26 22:44   ` Jonathan Tan
  2017-09-22 20:30 ` [PATCH 10/13] rev-list: add filtering help text Jeff Hostetler
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach rev-list to use the filtering provided by the
traverse_commit_list_filtered() interface to omit
unwanted objects from the result.

This feature is only enabled when one of the "--objects*"
options are used.

When the "--filter-print-manifest" option is used, the
omitted objects and their sizes are printed at the end.
These are marked with a "~".  This can be combined with
"--quiet" to get a list of just the omitted objects.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 builtin/rev-list.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 95d84d5..5f54495 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -53,6 +53,8 @@ static const char rev_list_usage[] =
 
 static struct progress *progress;
 static unsigned progress_counter;
+static struct object_filter_options filter_options;
+static struct oidset2 missing_objects;
 
 static void finish_commit(struct commit *commit, void *data);
 static void show_commit(struct commit *commit, void *data)
@@ -179,8 +181,25 @@ static void finish_commit(struct commit *commit, void *data)
 static void finish_object(struct object *obj, const char *name, void *cb_data)
 {
 	struct rev_list_info *info = cb_data;
-	if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid))
+	if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) {
+		if (filter_options.print_missing) {
+			oidset2_insert(&missing_objects, &obj->oid, obj->type,
+				       -1, name);
+			return;
+		}
+		if (filter_options.relax) {
+			/*
+			 * Relax consistency checks to not complain about
+			 * omitted objects (presumably caused by use of
+			 * the previous use of the 'filter-objects' feature).
+			 *
+			 * Note that this is independent of any filtering that
+			 * we are doing in this run.
+			 */
+			return;
+		}
 		die("missing blob object '%s'", oid_to_hex(&obj->oid));
+	}
 	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
 		parse_object(&obj->oid);
 }
@@ -200,6 +219,25 @@ static void show_edge(struct commit *commit)
 	printf("-%s\n", oid_to_hex(&commit->object.oid));
 }
 
+static void print_omitted_object(int i, int i_limit, struct oidset2_entry *e, void *cb_data)
+{
+	/* struct rev_list_info *info = cb_data; */
+	const char *tn = typename(e->type);
+
+	if (e->object_length == -1)
+		printf("~%s %s\n", oid_to_hex(&e->oid), tn);
+	else
+		printf("~%s %s %"PRIuMAX"\n", oid_to_hex(&e->oid), tn, e->object_length);
+}
+
+static void print_missing_object(int i, int i_limit, struct oidset2_entry *e, void *cb_data)
+{
+	/* struct rev_list_info *info = cb_data; */
+	const char *tn = typename(e->type);
+
+	printf("?%s %s\n", oid_to_hex(&e->oid), tn);
+}
+
 static void print_var_str(const char *var, const char *val)
 {
 	printf("%s='%s'\n", var, val);
@@ -333,6 +371,15 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 			show_progress = arg;
 			continue;
 		}
+		if (object_filter_hand_parse_arg(
+			    &filter_options, arg, 1, 1, 1)) {
+			if (!revs.blob_objects)
+				die(_("object filtering requires --objects"));
+			if (filter_options.use_blob &&
+			    !oidcmp(&filter_options.sparse_oid, &null_oid))
+				die(_("invalid sparse value"));
+			continue;
+		}
 		usage(rev_list_usage);
 
 	}
@@ -357,6 +404,11 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 	if (revs.show_notes)
 		die(_("rev-list does not support display of notes"));
 
+	if (object_filter_enabled(&filter_options)) {
+		if (use_bitmap_index)
+			die(_("cannot combine --use-bitmap-index with object filtering"));
+	}
+
 	save_commit_buffer = (revs.verbose_header ||
 			      revs.grep_filter.pattern_list ||
 			      revs.grep_filter.header_list);
@@ -401,7 +453,24 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 			return show_bisect_vars(&info, reaches, all);
 	}
 
-	traverse_commit_list(&revs, show_commit, show_object, &info);
+	if (filter_options.print_missing)
+		memset(&missing_objects, 0, sizeof(missing_objects));
+
+	if (object_filter_enabled(&filter_options))
+		traverse_commit_list_filtered(
+			&filter_options, &revs,
+			show_commit, show_object,
+			(filter_options.print_omitted
+			 ? print_omitted_object
+			 : NULL),
+			&info);
+	else
+		traverse_commit_list(&revs, show_commit, show_object, &info);
+
+	if (filter_options.print_missing) {
+		oidset2_foreach(&missing_objects, print_missing_object, &info);
+		oidset2_clear(&missing_objects);
+	}
 
 	stop_progress(&progress);
 
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 10/13] rev-list: add filtering help text
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (3 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 11/13] t6112: rev-list object filtering test Jeff Hostetler
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Documentation/git-rev-list.txt     |  9 ++++++++-
 Documentation/rev-list-options.txt | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-rev-list.txt b/Documentation/git-rev-list.txt
index ef22f17..b2e8255 100644
--- a/Documentation/git-rev-list.txt
+++ b/Documentation/git-rev-list.txt
@@ -47,7 +47,14 @@ SYNOPSIS
 	     [ --fixed-strings | -F ]
 	     [ --date=<format>]
 	     [ [ --objects | --objects-edge | --objects-edge-aggressive ]
-	       [ --unpacked ] ]
+	       [ --unpacked ]
+	       [ [ --filter-omit-all-blobs |
+		   --filter-omit-large-blobs=<n>[kmg] |
+		   --filter-use-blob=<blob-ish> |
+		   --filter-use-path=<path> ]
+		 [ --filter-print-missing ]
+		 [ --filter-print-omitted ] ] ]
+	     [ --filter-relax ]
 	     [ --pretty | --header ]
 	     [ --bisect ]
 	     [ --bisect-vars ]
diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt
index a6cf9eb..5c7d53c 100644
--- a/Documentation/rev-list-options.txt
+++ b/Documentation/rev-list-options.txt
@@ -698,6 +698,38 @@ ifdef::git-rev-list[]
 --unpacked::
 	Only useful with `--objects`; print the object IDs that are not
 	in packs.
+
+--filter-omit-all-blobs::
+	Only useful with one of the `--objects*`; omits all blobs from
+	the printed list of objects.
+
+--filter-omit-large-blobs=<n>[kmg]::
+	Only useful with one of the `--objects*`; omits blobs larger than
+	n bytes from the printed list of objects.  May optionally be
+	followed by 'k', 'm', or 'g' units.  Value may be zero.  Special
+	files (matching ".git*") are always included, regardless of size.
+
+--filter-use-blob=<blob-ish>::
+--filter-use-path=<path>::
+	Only useful with one of the `--objects*`; uses a sparse-checkout
+	specification contained in the given object or file to filter the
+	result to only contain blobs referenced by such a sparse-checkout.
+
+--filter-print-missing::
+	Prints a list of the missing objects for the requested traversal.
+	Object IDs are prefixed with a ``?'' character.  The object type
+	is printed after the ID.  This may be used with or without any of
+	the above filtering options.
+
+--filter-print-omitted::
+	Only useful with one of the above `--filter*`; prints a list
+	of the omitted objects.  Object IDs are prefixed with a ``~''
+	character.  The object size is printed after the ID.
+
+--filter-relax::
+	Relax consistency checking for missing blobs.  Do not warn of
+	missing blobs during normal (non-filtering) object traversal
+	following an earlier partial/narrow clone or fetch.
 endif::git-rev-list[]
 
 --no-walk[=(sorted|unsorted)]::
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 11/13] t6112: rev-list object filtering test
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (4 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 10/13] rev-list: add filtering help text Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 12/13] pack-objects: add object filtering support Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 13/13] pack-objects: add filtering help text Jeff Hostetler
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 t/t6112-rev-list-filters-objects.sh | 237 ++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100755 t/t6112-rev-list-filters-objects.sh

diff --git a/t/t6112-rev-list-filters-objects.sh b/t/t6112-rev-list-filters-objects.sh
new file mode 100755
index 0000000..66ff022
--- /dev/null
+++ b/t/t6112-rev-list-filters-objects.sh
@@ -0,0 +1,237 @@
+#!/bin/sh
+
+test_description='git rev-list with object filtering'
+
+. ./test-lib.sh
+
+# test the omit-all filter
+
+test_expect_success 'setup' '
+	echo "{print \$1}" >print_1.awk &&
+	echo "{print \$2}" >print_2.awk &&
+
+	for n in 1 2 3 4 5
+	do
+		echo $n > file.$n
+		git add file.$n
+		git commit -m "$n"
+	done
+'
+
+# Verify the omitted ("~OID") lines match the predicted list of OIDs.
+test_expect_success 'omit-all-blobs omitted 5 blobs' '
+	git ls-files -s file.1 file.2 file.3 file.4 file.5 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-all-blobs \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+# Verify the complete OID list matches the unfiltered OIDs plus the omitted OIDs.
+test_expect_success 'omit-all-blobs nothing else changed' '
+	git rev-list HEAD --objects \
+		| awk -f print_1.awk \
+		| sort >expected &&
+	git rev-list HEAD --objects --filter-print-omitted --filter-omit-all-blobs \
+		| awk -f print_1.awk \
+		| sed "s/~//" \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+# test the size-based filtering.
+
+test_expect_success 'setup_large' '
+	for n in 1000 10000
+	do
+		printf "%"$n"s" X > large.$n
+		git add large.$n
+		git commit -m "$n"
+	done
+'
+
+test_expect_success 'omit-large-blobs omit 2 blobs' '
+	git ls-files -s large.1000 large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=500 \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs nothing else changed' '
+	git rev-list HEAD --objects \
+		| awk -f print_1.awk \
+		| sort >expected &&
+	git rev-list HEAD --objects --filter-print-omitted --filter-omit-large-blobs=500 \
+		| awk -f print_1.awk \
+		| sed "s/~//" \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+# boundary test around the size parameter.
+# filter is strictly less than the value, so size 500 and 1000 should have the
+# same results, but 1001 should filter more.
+
+test_expect_success 'omit-large-blobs omit 2 blobs' '
+	git ls-files -s large.1000 large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1000 \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs omit 1 blob' '
+	git ls-files -s large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1001 \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs omit 1 blob (1k)' '
+	git ls-files -s large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1k \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs omit no blob (1m)' '
+	cat </dev/null >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1m \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+# Test sparse-pattern filtering (using explicit local patterns).
+# We use the same disk format as sparse-checkout to specify the
+# filtering, but do not require sparse-checkout to be enabled.
+
+test_expect_success 'setup using sparse file' '
+	mkdir dir1 &&
+	for n in sparse1 sparse2
+	do
+		echo $n > $n
+		git add $n
+		echo dir1/$n > dir1/$n
+		git add dir1/$n
+	done &&
+	git commit -m "sparse" &&
+	echo dir1/ >pattern1 &&
+	echo sparse1 >pattern2
+'
+
+# pattern1 should only include the 2 dir1/* files.
+# and omit the 5 file.*, 2 large.*, and 2 top-level sparse* files.
+test_expect_success 'sparse using path pattern1' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-path=pattern1 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 9 &&
+
+	grep "dir1/sparse" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# pattern2 should include the sparse1 and dir1/sparse1.
+# and omit the 5 file.*, 2 large.*, and the 2 sparse2 files.
+test_expect_success 'sparse using path pattern2' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-path=pattern2 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 9 &&
+
+	grep "sparse1" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# Test sparse-pattern filtering (using a blob in the repo).
+# This could be used to later let pack-objects do filtering.
+
+# pattern1 should only include the 2 dir1/* files.
+# and omit the 5 file.*, 2 large.*, 2 top-level sparse*, and 1 pattern file.
+test_expect_success 'sparse using OID for pattern1' '
+	git add pattern1 &&
+	git commit -m "pattern1" &&
+
+	git rev-list HEAD --objects >normal.output &&
+	grep "pattern1" <normal.output | awk "{print \$1;}" >pattern1.oid &&
+
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=`cat pattern1.oid` >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 10 &&
+
+	grep "dir1/sparse" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# repeat previous test but use blob-ish expression rather than OID.
+test_expect_success 'sparse using blob-ish to get OID for pattern spec' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=HEAD:pattern1 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 10 &&
+
+	grep "dir1/sparse" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# pattern2 should include the sparse1 and dir1/sparse1.
+# and omit the 5 file.*, 2 large.*, 2 top-level sparse*, and 2 pattern files.
+test_expect_success 'sparse using OID for pattern2' '
+	git add pattern2 &&
+	git commit -m "pattern2" &&
+
+	git rev-list HEAD --objects >normal.output &&
+	grep "pattern2" <normal.output | awk "{print \$1;}" >pattern2.oid &&
+
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=`cat pattern2.oid` >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 11 &&
+
+	grep "sparse1" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# repeat previous test but use blob-ish expression rather than OID.
+test_expect_success 'sparse using blob-ish rather than OID for pattern2' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=HEAD:pattern2 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 11 &&
+
+	grep "sparse1" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# delete some loose objects and test rev-list printing them as missing.
+test_expect_success 'print missing objects' '
+	git ls-files -s file.1 file.2 file.3 file.4 file.5 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	for id in `cat expected | sed "s|..|&/|"`
+	do
+		rm .git/objects/$id
+	done &&
+	git rev-list --quiet HEAD --filter-print-missing --objects \
+		| awk -f print_1.awk \
+		| sed "s/?//" \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+test_done
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 12/13] pack-objects: add object filtering support
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (5 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 11/13] t6112: rev-list object filtering test Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 13/13] pack-objects: add filtering help text Jeff Hostetler
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach pack-objects to use the filtering provided by the
traverse_commit_list_filtered() interface to omit unwanted
objects from the resulting packfile.

This feature is intended for partial clone/fetch.

Filtering requires the use of the "--stdout" option.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 builtin/pack-objects.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f4a8441..1712d1b 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -78,6 +78,8 @@ static unsigned long cache_max_small_delta_size = 1000;
 
 static unsigned long window_memory_limit = 0;
 
+static struct object_filter_options filter_options;
+
 /*
  * stats
  */
@@ -2810,7 +2812,12 @@ static void get_object_list(int ac, const char **av)
 	if (prepare_revision_walk(&revs))
 		die("revision walk setup failed");
 	mark_edges_uninteresting(&revs, show_edge);
-	traverse_commit_list(&revs, show_commit, show_object, NULL);
+	if (object_filter_enabled(&filter_options))
+		traverse_commit_list_filtered(&filter_options, &revs,
+					      show_commit, show_object,
+					      NULL, NULL);
+	else
+		traverse_commit_list(&revs, show_commit, show_object, NULL);
 
 	if (unpack_unreachable_expiration) {
 		revs.ignore_missing_links = 1;
@@ -2946,6 +2953,15 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("use a bitmap index if available to speed up counting objects")),
 		OPT_BOOL(0, "write-bitmap-index", &write_bitmap_index,
 			 N_("write a bitmap index together with the pack index")),
+
+		OPT_PARSE_FILTER_OMIT_ALL_BLOBS(&filter_options),
+		OPT_PARSE_FILTER_OMIT_LARGE_BLOBS(&filter_options),
+		OPT_PARSE_FILTER_USE_BLOB(&filter_options),
+		OPT_PARSE_FILTER_USE_PATH(&filter_options),
+		/* not needed: OPT_PARSE_FILTER_PRINT_MISSING */
+		/* not needed: OPT_PARSE_FILTER_PRINT_OMITTED */
+		/* not needed: OPT_PARSE_FILTER_RELAX */
+
 		OPT_END(),
 	};
 
@@ -3022,6 +3038,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (!rev_list_all || !rev_list_reflog || !rev_list_index)
 		unpack_unreachable_expiration = 0;
 
+	if (object_filter_enabled(&filter_options)) {
+		if (!pack_to_stdout)
+			die("cannot use filtering with an indexable pack.");
+		use_bitmap_index = 0;
+	}
+
 	/*
 	 * "soft" reasons not to use bitmaps - for on-disk repack by default we want
 	 *
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 13/13] pack-objects: add filtering help text
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (6 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 12/13] pack-objects: add object filtering support Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Add help text for new object filtering options to
pack-objects documentation.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Documentation/git-pack-objects.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 8973510..1ed7d4b 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -231,6 +231,23 @@ So does `git bundle` (see linkgit:git-bundle[1]) when it creates a bundle.
 	With this option, parents that are hidden by grafts are packed
 	nevertheless.
 
+--filter-omit-all-blobs::
+	Requires `--stdout`.  Omits all blobs from the packfile.
+
+--filter-omit-large-blobs=<n>[kmg]::
+	Requires `--stdout`.  Omits large blobs larger than n bytes from
+	the packfile.  May optionally be followed by 'k', 'm', or 'g' units.
+	Value may be zero.  Special files (matching ".git*") are always
+	included, regardless of size.
+
+--filter-use-blob=<blob-ish>::
+--filter-use-path=<path>::
+	Requires `--stdout`.  Use a sparse-checkout specification to
+	filter the resulting packfile to only contain the blobs that
+	would be referenced by such a sparse-checkout.  `<path>` specifies
+	a local pathname.  `<blob-ish>` specifies an expression that can
+	be evaluated to a blob.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
@ 2017-09-26 22:39   ` Jonathan Tan
  2017-09-27 17:09     ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Tan @ 2017-09-26 22:39 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Fri, 22 Sep 2017 20:30:11 +0000
Jeff Hostetler <git@jeffhostetler.com> wrote:

>  Makefile        |   1 +
>  object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 443 insertions(+)
>  create mode 100644 object-filter.c
>  create mode 100644 object-filter.h

I think these and list-objects-filter-* are multiple levels of
indirection too many. Would a single file with a few implementations of
filter_object_fn be sufficient?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 09/13] rev-list: add object filtering support
  2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
@ 2017-09-26 22:44   ` Jonathan Tan
  2017-09-27 17:26     ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Tan @ 2017-09-26 22:44 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Fri, 22 Sep 2017 20:30:13 +0000
Jeff Hostetler <git@jeffhostetler.com> wrote:

> +		if (filter_options.relax) {

Add some documentation about how this differs from ignore_missing_links
in struct rev_info.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-26 22:39   ` Jonathan Tan
@ 2017-09-27 17:09     ` Jeff Hostetler
  2017-09-28  0:05       ` Jonathan Tan
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-27 17:09 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, gitster, peff, Jeff Hostetler



On 9/26/2017 6:39 PM, Jonathan Tan wrote:
> On Fri, 22 Sep 2017 20:30:11 +0000
> Jeff Hostetler <git@jeffhostetler.com> wrote:
> 
>>   Makefile        |   1 +
>>   object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
>>   3 files changed, 443 insertions(+)
>>   create mode 100644 object-filter.c
>>   create mode 100644 object-filter.h
> 
> I think these and list-objects-filter-* are multiple levels of
> indirection too many. Would a single file with a few implementations of
> filter_object_fn be sufficient?

I did that in my first draft and I found it confusing.

Each filter has 3 parts (some filter-specific data structures,
a filter callback routine, a driver to call the traverse code).
I found it easier to reason about each filter in isolation.
And it makes it easier to work on each independently and keep
their inclusion in separate commits.


  

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 09/13] rev-list: add object filtering support
  2017-09-26 22:44   ` Jonathan Tan
@ 2017-09-27 17:26     ` Jeff Hostetler
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-27 17:26 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, gitster, peff, Jeff Hostetler



On 9/26/2017 6:44 PM, Jonathan Tan wrote:
> On Fri, 22 Sep 2017 20:30:13 +0000
> Jeff Hostetler <git@jeffhostetler.com> wrote:
> 
>> +		if (filter_options.relax) {
> 
> Add some documentation about how this differs from ignore_missing_links
> in struct rev_info.
> 

It's unclear what that field means.  (It's not documented that I saw.)
And it is explicitly turned on and off in pack-bitmap.c, so again I'm
not sure how that agrees with what I'm doing.

My relax field is probably an interim thing (to allow rev-list, index-pack,
and friends not complain about missing objects during my development).

I suspect that you and I will need to merge our efforts here.  You have
a similar variable "revs->exclude_promisor_objects".  But I need to study
how our usages differ.

Thanks,
Jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-27 17:09     ` Jeff Hostetler
@ 2017-09-28  0:05       ` Jonathan Tan
  2017-09-28 14:33         ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Tan @ 2017-09-28  0:05 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Wed, 27 Sep 2017 13:09:42 -0400
Jeff Hostetler <git@jeffhostetler.com> wrote:

> On 9/26/2017 6:39 PM, Jonathan Tan wrote:
> > On Fri, 22 Sep 2017 20:30:11 +0000
> > Jeff Hostetler <git@jeffhostetler.com> wrote:
> > 
> >>   Makefile        |   1 +
> >>   object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>   object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
> >>   3 files changed, 443 insertions(+)
> >>   create mode 100644 object-filter.c
> >>   create mode 100644 object-filter.h
> > 
> > I think these and list-objects-filter-* are multiple levels of
> > indirection too many. Would a single file with a few implementations of
> > filter_object_fn be sufficient?
> 
> I did that in my first draft and I found it confusing.
> 
> Each filter has 3 parts (some filter-specific data structures,
> a filter callback routine, a driver to call the traverse code).
> I found it easier to reason about each filter in isolation.
> And it makes it easier to work on each independently and keep
> their inclusion in separate commits.

I looked at object-filter.{c,h} a bit more. It seems that these files:
 1) define a struct that contains all the options that we would want
 2) supplies a way to populate this struct from code that uses parse-options
 3) supplies a way to populate this struct from code that calculates
    options by hand
 4) supplies a way to populate this struct from "protocol" ("<key>" or
    "<key> <value>" strings)

And the next commit takes the struct that object-filter.{c,h} produces
and actually performs the traversal.

I think this can be significantly simplified, though. Would this work:
 a) Define the object_filter_options struct, but make all fields
    correspond to a single parameter each. Define
    OBJECT_FILTER_OPTIONS_INIT to initialize everything to 0 except for
    large_byte_limit to ULONG_MAX (so that we can detect if something
    else is set to it).
 b) Define one single OPT_PARSE_FILTER macro containing all the
    parameters. We can use the non-callback macros here. That solves 2)
    above.
 c) Define a function that takes in (int *argc, char ***argv) that can
    "massage" it to remove all filter-related arguments, storing them in
    a object_filter_options struct. That solves 3) above. As discussed
    in the API documentation, this means that argument lists of the form
    "--unknown --known" (where "--unknown" takes an argument) are
    processed differently, but then again, rev-list never supported them
    anyway (it required "--unknown=<arg>").
 d) Define a function that converts "<key>" into "--<key>" and "<key>
    <value>" into "--<key>=<value>", and use the existing mechanism.
    That solves 4) above.

This removes the need to maintain the lists of one-per-argument
functions, including the parse_filter_* and opt_parse_filter_* functions
declared in the header file. If we were to add a feature, we wouldn't
need to change anything in the caller, and wouldn't need to hand-edit
object_filter_hand_parse_arg() and object_filter_hand_parse_protocol().

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-28  0:05       ` Jonathan Tan
@ 2017-09-28 14:33         ` Jeff Hostetler
  2017-09-29 19:47           ` Jonathan Tan
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-28 14:33 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, gitster, peff, Jeff Hostetler



On 9/27/2017 8:05 PM, Jonathan Tan wrote:
> On Wed, 27 Sep 2017 13:09:42 -0400
> Jeff Hostetler <git@jeffhostetler.com> wrote:
> 
>> On 9/26/2017 6:39 PM, Jonathan Tan wrote:
>>> On Fri, 22 Sep 2017 20:30:11 +0000
>>> Jeff Hostetler <git@jeffhostetler.com> wrote:
>>>
>>>>    Makefile        |   1 +
>>>>    object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
>>>>    3 files changed, 443 insertions(+)
>>>>    create mode 100644 object-filter.c
>>>>    create mode 100644 object-filter.h
>>>
>>> I think these and list-objects-filter-* are multiple levels of
>>> indirection too many. Would a single file with a few implementations of
>>> filter_object_fn be sufficient?
>>
>> I did that in my first draft and I found it confusing.
>>
>> Each filter has 3 parts (some filter-specific data structures,
>> a filter callback routine, a driver to call the traverse code).
>> I found it easier to reason about each filter in isolation.
>> And it makes it easier to work on each independently and keep
>> their inclusion in separate commits.
> 
> I looked at object-filter.{c,h} a bit more. It seems that these files:
>   1) define a struct that contains all the options that we would want
>   2) supplies a way to populate this struct from code that uses parse-options
>   3) supplies a way to populate this struct from code that calculates
>      options by hand
>   4) supplies a way to populate this struct from "protocol" ("<key>" or
>      "<key> <value>" strings)
> 
> And the next commit takes the struct that object-filter.{c,h} produces
> and actually performs the traversal.
> 
> I think this can be significantly simplified, though. Would this work:
>   a) Define the object_filter_options struct, but make all fields
>      correspond to a single parameter each. Define
>      OBJECT_FILTER_OPTIONS_INIT to initialize everything to 0 except for
>      large_byte_limit to ULONG_MAX (so that we can detect if something
>      else is set to it).
>   b) Define one single OPT_PARSE_FILTER macro containing all the
>      parameters. We can use the non-callback macros here. That solves 2)
>      above.
>   c) Define a function that takes in (int *argc, char ***argv) that can
>      "massage" it to remove all filter-related arguments, storing them in
>      a object_filter_options struct. That solves 3) above. As discussed
>      in the API documentation, this means that argument lists of the form
>      "--unknown --known" (where "--unknown" takes an argument) are
>      processed differently, but then again, rev-list never supported them
>      anyway (it required "--unknown=<arg>").
>   d) Define a function that converts "<key>" into "--<key>" and "<key>
>      <value>" into "--<key>=<value>", and use the existing mechanism.
>      That solves 4) above.
> 
> This removes the need to maintain the lists of one-per-argument
> functions, including the parse_filter_* and opt_parse_filter_* functions
> declared in the header file. If we were to add a feature, we wouldn't
> need to change anything in the caller, and wouldn't need to hand-edit
> object_filter_hand_parse_arg() and object_filter_hand_parse_protocol().

Maybe.  What I have here now is the result of adding these arguments to
rev-list and pack-objects (in the current patch series), and also to
fetch-pack, fetch, clone, upload-pack, index-pack, and the transport and
protocol code (in a follow-on patch series that I've omitted for the moment).
And there will probably be a few more, such as fsck, gc, and etc.  I hesitate
to refine the macros too much further until we've agreement on the overall
approach and terms.

Thanks
Jeff



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-28 14:33         ` Jeff Hostetler
@ 2017-09-29 19:47           ` Jonathan Tan
  0 siblings, 0 replies; 16+ messages in thread
From: Jonathan Tan @ 2017-09-29 19:47 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Thu, 28 Sep 2017 10:33:39 -0400
Jeff Hostetler <git@jeffhostetler.com> wrote:

> Maybe.  What I have here now is the result of adding these arguments to
> rev-list and pack-objects (in the current patch series), and also to
> fetch-pack, fetch, clone, upload-pack, index-pack, and the transport and
> protocol code (in a follow-on patch series that I've omitted for the moment).
> And there will probably be a few more, such as fsck, gc, and etc.  I hesitate
> to refine the macros too much further until we've agreement on the overall
> approach and terms.

Fair enough. My current opinion on the overall approach (others might
differ, of course):
 - Filtering based on a sparse checkout specification in rev-list and
   pack-objects sounds useful to me, and is worth the filtering
   mechanism.
 - Filtering based on size (or based on type) still doesn't seem useful
   to me in rev-list, but if we're going to implement the filtering
   mechanism anyway, we might as well use the mechanism.
 - Besides my comments in [1], I think the API could still be slightly
   better organized. For example, object-filter probably should be the
   one to define the traverse_ function that takes in struct
   object_filter_options, and optionally a set of excluded objects to
   populate.

[1] https://public-inbox.org/git/20170927170533.65498396e008fa148a3fda90@google.com/

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2017-09-29 19:47 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
2017-09-26 22:39   ` Jonathan Tan
2017-09-27 17:09     ` Jeff Hostetler
2017-09-28  0:05       ` Jonathan Tan
2017-09-28 14:33         ` Jeff Hostetler
2017-09-29 19:47           ` Jonathan Tan
2017-09-22 20:30 ` [PATCH 08/13] list-objects: add traverse_commit_list_filtered method Jeff Hostetler
2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
2017-09-26 22:44   ` Jonathan Tan
2017-09-27 17:26     ` Jeff Hostetler
2017-09-22 20:30 ` [PATCH 10/13] rev-list: add filtering help text Jeff Hostetler
2017-09-22 20:30 ` [PATCH 11/13] t6112: rev-list object filtering test Jeff Hostetler
2017-09-22 20:30 ` [PATCH 12/13] pack-objects: add object filtering support Jeff Hostetler
2017-09-22 20:30 ` [PATCH 13/13] pack-objects: add filtering help text Jeff Hostetler

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).