git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Matheus Tavares <matheus.bernardino@usp.br>
To: git@vger.kernel.org
Cc: "Jonathan Nieder" <jrnieder@gmail.com>,
	"brian m. carlson" <sandals@crustytoothpaste.net>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>,
	"Brandon Williams" <bwilliams.eng@gmail.com>,
	"Junio C Hamano" <gitster@pobox.com>
Subject: [RFC PATCH 3/3] grep: don't add submodules to the alternates list
Date: Tue, 17 Sep 2019 22:56:05 -0300	[thread overview]
Message-ID: <fc5f5fe95c10ac66d1234a025122c8866965021f.1568771073.git.matheus.bernardino@usp.br> (raw)
In-Reply-To: <cover.1568771073.git.matheus.bernardino@usp.br>

When grepping with --recurse-submodules, the object directory of the
submodule is added to the in-memory alternates list. This makes git need
to watch out for more packfiles which might bring bad consequences for
memory and performance.

Now that raw_object_store is a member of the struct repository, it's
possible to use the submodules' instances directly, without the need to
add its odbs to the alternates list. So let's instead pass the subrepo
down to the threads and replace function calls which use
the_repository by default for their more general counterparts. Also,
submodule cleanup must be refactored as calling repo_clear() at the end
of grep_submodules(), might free the subrepo's resources before threads
have finished working on it. To do that, let's keep track of the
workers progress and only call repo_clear() when it's safe to do so.

This change will also help future patches to re-enable threads in
non-worktree grep as less mutual exclusions will be needed. E.g.
grep_submodule()'s call to parse_object_or_die() won't conflict with
other calls to the same function by worker threads (as they should be
referencing different 'struct repository's).

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
 builtin/grep.c | 80 ++++++++++++++++++++++++++++++++++++--------------
 grep.c         | 26 ++++++++--------
 2 files changed, 70 insertions(+), 36 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index 73ef00c426..0bb58c456b 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -43,6 +43,15 @@ static pthread_t *threads;
  */
 struct work_item {
 	struct grep_source source;
+
+	/*
+	 * Each worker thread is initialized with a 'struct grep_opt *opt' where
+	 * opt->repo points to the_repository. But a work item may refeer to a
+	 * subrepo. So the repository relative to each task is also passed to
+	 * the assigned thread and its opt->repo is updated when retrieving the
+	 * task.
+	 */
+	struct repository *repo;
 	char done;
 	struct strbuf out;
 };
@@ -65,6 +74,13 @@ static int todo_done;
 /* Has all work items been added? */
 static int all_work_added;
 
+/* Tracks the work progress in subrepos to correctly free them in the end */
+struct progress_in_subrepo {
+	struct repository subrepo;
+	int all_work_added;
+	int work_counter;
+};
+
 /* This lock protects all the variables above. */
 static pthread_mutex_t grep_mutex;
 
@@ -93,6 +109,8 @@ static int skip_first_line;
 
 static void add_work(struct grep_opt *opt, const struct grep_source *gs)
 {
+	struct progress_in_subrepo *p;
+
 	grep_lock();
 
 	while ((todo_end+1) % ARRAY_SIZE(todo) == todo_done) {
@@ -104,9 +122,15 @@ static void add_work(struct grep_opt *opt, const struct grep_source *gs)
 		grep_source_load_driver(&todo[todo_end].source,
 					opt->repo->index);
 	todo[todo_end].done = 0;
+	todo[todo_end].repo = opt->repo;
 	strbuf_reset(&todo[todo_end].out);
 	todo_end = (todo_end + 1) % ARRAY_SIZE(todo);
 
+	if (opt->repo != the_repository) {
+		p = container_of(opt->repo, struct progress_in_subrepo, subrepo);
+		p->work_counter++;
+	}
+
 	pthread_cond_signal(&cond_add);
 	grep_unlock();
 }
@@ -132,6 +156,7 @@ static struct work_item *get_work(void)
 
 static void work_done(struct work_item *w)
 {
+	struct progress_in_subrepo *p;
 	int old_done;
 
 	grep_lock();
@@ -156,6 +181,17 @@ static void work_done(struct work_item *w)
 
 			write_or_die(1, p, len);
 		}
+
+		if (w->repo != the_repository) {
+			p = container_of(w->repo, struct progress_in_subrepo,
+					 subrepo);
+			p->work_counter--;
+			if (p->work_counter == 0 && p->all_work_added) {
+				repo_clear(&p->subrepo);
+				free(p);
+			}
+		}
+
 		grep_source_clear(&w->source);
 	}
 
@@ -179,6 +215,7 @@ static void *run(void *arg)
 			break;
 
 		opt->output_priv = w;
+		opt->repo = w->repo;
 		hit |= grep_source(opt, &w->source);
 		grep_source_clear_data(&w->source);
 		work_done(w);
@@ -295,12 +332,14 @@ static int grep_cmd_config(const char *var, const char *value, void *cb)
 	return st;
 }
 
-static void *lock_and_read_oid_file(const struct object_id *oid, enum object_type *type, unsigned long *size)
+static void *lock_and_read_oid_file(struct repository *r,
+				    const struct object_id *oid,
+				    enum object_type *type, unsigned long *size)
 {
 	void *data;
 
 	grep_read_lock();
-	data = read_object_file(oid, type, size);
+	data = repo_read_object_file(r, oid, type, size);
 	grep_read_unlock();
 	return data;
 }
@@ -405,7 +444,8 @@ static int grep_submodule(struct grep_opt *opt,
 			  const struct object_id *oid,
 			  const char *filename, const char *path, int cached)
 {
-	struct repository subrepo;
+	struct progress_in_subrepo *p = xcalloc(1, sizeof(*p));
+	struct repository *subrepo = &p->subrepo;
 	struct repository *superproject = opt->repo;
 	const struct submodule *sub = submodule_from_path(superproject,
 							  &null_oid, path);
@@ -422,31 +462,21 @@ static int grep_submodule(struct grep_opt *opt,
 
 	if (!is_submodule_active(superproject, path)) {
 		grep_read_unlock();
+		free(p);
 		return 0;
 	}
 
-	if (repo_submodule_init(&subrepo, superproject, sub)) {
+	if (repo_submodule_init(subrepo, superproject, sub)) {
 		grep_read_unlock();
+		free(p);
 		return 0;
 	}
 
-	repo_read_gitmodules(&subrepo);
-
-	/*
-	 * NEEDSWORK: This adds the submodule's object directory to the list of
-	 * alternates for the single in-memory object store.  This has some bad
-	 * consequences for memory (processed objects will never be freed) and
-	 * performance (this increases the number of pack files git has to pay
-	 * attention to, to the sum of the number of pack files in all the
-	 * repositories processed so far).  This can be removed once the object
-	 * store is no longer global and instead is a member of the repository
-	 * object.
-	 */
-	add_to_alternates_memory(subrepo.objects->odb->path);
+	repo_read_gitmodules(subrepo);
 	grep_read_unlock();
 
 	memcpy(&subopt, opt, sizeof(subopt));
-	subopt.repo = &subrepo;
+	subopt.repo = subrepo;
 
 	if (oid) {
 		struct object *object;
@@ -455,10 +485,10 @@ static int grep_submodule(struct grep_opt *opt,
 		unsigned long size;
 		struct strbuf base = STRBUF_INIT;
 
-		object = parse_object_or_die(the_repository, oid, NULL);
+		object = parse_object_or_die(subrepo, oid, NULL);
 
 		grep_read_lock();
-		data = read_object_with_reference(&subrepo,
+		data = read_object_with_reference(subrepo,
 						  &object->oid, tree_type,
 						  &size, NULL);
 		grep_read_unlock();
@@ -478,7 +508,13 @@ static int grep_submodule(struct grep_opt *opt,
 		hit = grep_cache(&subopt, pathspec, cached);
 	}
 
-	repo_clear(&subrepo);
+	grep_lock();
+	p->all_work_added = 1;
+	if (p->work_counter == 0) {
+		repo_clear(&p->subrepo);
+		free(p);
+	}
+	grep_unlock();
 	return hit;
 }
 
@@ -587,7 +623,7 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
 			void *data;
 			unsigned long size;
 
-			data = lock_and_read_oid_file(&entry.oid, &type, &size);
+			data = lock_and_read_oid_file(repo, &entry.oid, &type, &size);
 			if (!data)
 				die(_("unable to read tree (%s)"),
 				    oid_to_hex(&entry.oid));
diff --git a/grep.c b/grep.c
index cd952ef5d3..0b3f38aaae 100644
--- a/grep.c
+++ b/grep.c
@@ -10,9 +10,8 @@
 #include "quote.h"
 #include "help.h"
 
-static int grep_source_load(struct grep_source *gs);
-static int grep_source_is_binary(struct grep_source *gs,
-				 struct index_state *istate);
+static int grep_source_load(struct repository *r, struct grep_source *gs);
+static int grep_source_is_binary(struct repository *r, struct grep_source *gs);
 
 static struct grep_opt grep_defaults;
 
@@ -1714,7 +1713,7 @@ static int fill_textconv_grep(struct repository *r,
 	size_t size;
 
 	if (!driver || !driver->textconv)
-		return grep_source_load(gs);
+		return grep_source_load(r, gs);
 
 	/*
 	 * The textconv interface is intimately tied to diff_filespecs, so we
@@ -1820,11 +1819,11 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 	if (!textconv) {
 		switch (opt->binary) {
 		case GREP_BINARY_DEFAULT:
-			if (grep_source_is_binary(gs, opt->repo->index))
+			if (grep_source_is_binary(opt->repo, gs))
 				binary_match_only = 1;
 			break;
 		case GREP_BINARY_NOMATCH:
-			if (grep_source_is_binary(gs, opt->repo->index))
+			if (grep_source_is_binary(opt->repo, gs))
 				return 0; /* Assume unmatch */
 			break;
 		case GREP_BINARY_TEXT:
@@ -2105,12 +2104,12 @@ void grep_source_clear_data(struct grep_source *gs)
 	}
 }
 
-static int grep_source_load_oid(struct grep_source *gs)
+static int grep_source_load_oid(struct repository *r, struct grep_source *gs)
 {
 	enum object_type type;
 
 	grep_read_lock();
-	gs->buf = read_object_file(gs->identifier, &type, &gs->size);
+	gs->buf = repo_read_object_file(r, gs->identifier, &type, &gs->size);
 	grep_read_unlock();
 
 	if (!gs->buf)
@@ -2154,7 +2153,7 @@ static int grep_source_load_file(struct grep_source *gs)
 	return 0;
 }
 
-static int grep_source_load(struct grep_source *gs)
+static int grep_source_load(struct repository *r, struct grep_source *gs)
 {
 	if (gs->buf)
 		return 0;
@@ -2163,7 +2162,7 @@ static int grep_source_load(struct grep_source *gs)
 	case GREP_SOURCE_FILE:
 		return grep_source_load_file(gs);
 	case GREP_SOURCE_OID:
-		return grep_source_load_oid(gs);
+		return grep_source_load_oid(r, gs);
 	case GREP_SOURCE_BUF:
 		return gs->buf ? 0 : -1;
 	}
@@ -2184,14 +2183,13 @@ void grep_source_load_driver(struct grep_source *gs,
 	grep_attr_unlock();
 }
 
-static int grep_source_is_binary(struct grep_source *gs,
-				 struct index_state *istate)
+static int grep_source_is_binary(struct repository *r, struct grep_source *gs)
 {
-	grep_source_load_driver(gs, istate);
+	grep_source_load_driver(gs, r->index);
 	if (gs->driver->binary != -1)
 		return gs->driver->binary;
 
-	if (!grep_source_load(gs))
+	if (!grep_source_load(r, gs))
 		return buffer_is_binary(gs->buf, gs->size);
 
 	return 0;
-- 
2.23.0


  parent reply	other threads:[~2019-09-18  1:56 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-18  1:56 [RFC PATCH 0/3] grep: don't add subrepos to in-memory alternates Matheus Tavares
2019-09-18  1:56 ` [RFC PATCH 1/3] diff: use the given repo at diff_populate_filespec() Matheus Tavares
2019-09-18  1:56 ` [RFC PATCH 2/3] object: allow parse_object_or_die() to handle any repo Matheus Tavares
2019-09-18  1:56 ` Matheus Tavares [this message]
2019-09-21 21:58   ` [RFC PATCH 3/3] grep: don't add submodules to the alternates list Brandon Williams
2019-09-18 19:55 ` [RFC PATCH 0/3] grep: don't add subrepos to in-memory alternates Junio C Hamano
2019-09-19  5:18   ` Matheus Tavares Bernardino
2019-09-20 16:26     ` Junio C Hamano
2019-09-21 20:34       ` Matheus Tavares Bernardino
2019-09-28  3:24         ` Junio C Hamano
2019-09-28  4:20           ` Matheus Tavares Bernardino

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=fc5f5fe95c10ac66d1234a025122c8866965021f.1568771073.git.matheus.bernardino@usp.br \
    --to=matheus.bernardino@usp.br \
    --cc=bwilliams.eng@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jrnieder@gmail.com \
    --cc=pclouds@gmail.com \
    --cc=sandals@crustytoothpaste.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).