git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Nicolas Pitre <nico@cam.org>
To: "Shawn O. Pearce" <spearce@spearce.org>
Cc: Jon Smirl <jonsmirl@gmail.com>,
	Martin Koegler <mkoegler@auto.tuwien.ac.at>,
	Git Mailing List <git@vger.kernel.org>
Subject: Re: performance on repack
Date: Thu, 30 Aug 2007 00:27:50 -0400 (EDT)	[thread overview]
Message-ID: <alpine.LFD.0.999.0708300005110.16727@xanadu.home> (raw)
In-Reply-To: <alpine.LFD.0.999.0708151500510.5415@xanadu.home>

On Wed, 15 Aug 2007, Nicolas Pitre wrote:

> On Wed, 15 Aug 2007, Shawn O. Pearce wrote:
> 
> > Nicolas Pitre <nico@cam.org> wrote:
> >
> > > Remains the approach of calling find_deltas() n times with 1/n of the 
> > > delta_list, one call per thread, for the bulk of the delta search work.  
> > > This might even be much simpler than my current patch is.  However this 
> > > approach will require n times the memory for the delta window data.  
> > > Thread creation overhead will occur only once.
> > 
> > Yea, that I agree with.  The other thing is the main thread may need
> > to push a couple of windows worth of work into the threads, so that
> > if this window's 1/n unit goes really fast on that thread it doesn't
> > stall out waiting for the main thread to get it more data.
> 
> It is easier to keep 4 threads busy with 100000 objects each to deal 
> with in a conventional way than keeping 10 threads busy when processing 
> a single delta window in parallel.  Or the delta_list can be partitioned 
> in smaller chunks and anytime one of the find_deltas() thread is done 
> then it is fed another chunk.
> 
> The side effect of that is that no delta will cross chunk boundaries.  
> But just like when repacking multiple existing packs into one with 
> delta data reuse, the actual chunk boundaries can be re-fed to 
> find_deltas() in order to find more deltas across chunks.

Well, here is a quick implementation of this idea for those who would 
like to give it a try.  Performance is indeed much better.  The delta 
progress status is however fscked up at the moment, so don't pay too 
much attention to it.  Partitioning is also extremely crude.  But the 
idea looks promizing.

---

diff --git a/Makefile b/Makefile
index 4eb4637..c3c6e68 100644
--- a/Makefile
+++ b/Makefile
@@ -122,6 +122,9 @@ all::
 # If not set it defaults to the bare 'wish'. If it is set to the empty
 # string then NO_TCLTK will be forced (this is used by configure script).
 #
+# Define THREADED_DELTA_SEARCH if you have pthreads and wish to exploit
+# parallel delta searching when packing objects.
+#
 
 GIT-VERSION-FILE: .FORCE-GIT-VERSION-FILE
 	@$(SHELL_PATH) ./GIT-VERSION-GEN
@@ -662,6 +665,11 @@ ifdef NO_HSTRERROR
 	COMPAT_OBJS += compat/hstrerror.o
 endif
 
+ifdef THREADED_DELTA_SEARCH
+	BASIC_CFLAGS += -DTHREADED_DELTA_SEARCH
+	EXTLIBS += -lpthread
+endif
+
 ifeq ($(TCLTK_PATH),)
 NO_TCLTK=NoThanks
 endif
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index e917e8e..7d68f82 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -15,6 +15,10 @@
 #include "list-objects.h"
 #include "progress.h"
 
+#ifdef THREADED_DELTA_SEARCH
+#include <pthread.h>
+#endif
+
 static const char pack_usage[] = "\
 git-pack-objects [{ -q | --progress | --all-progress }] \n\
 	[--max-pack-size=N] [--local] [--incremental] \n\
@@ -1290,6 +1294,20 @@ static int delta_cacheable(unsigned long src_size, unsigned long trg_size,
 	return 0;
 }
 
+#ifdef THREADED_DELTA_SEARCH
+
+static pthread_mutex_t read_sha1_file_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+#define read_lock()		pthread_mutex_lock(&read_sha1_file_mutex)
+#define read_unlock()		pthread_mutex_unlock(&read_sha1_file_mutex)
+
+#else
+
+#define read_lock()		0
+#define read_unlock()		0
+
+#endif
+
 /*
  * We search for deltas _backwards_ in a list sorted by type and
  * by size, so that we see progressively smaller and smaller files.
@@ -1348,7 +1366,9 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Load data if not already done */
 	if (!trg->data) {
+		read_lock();
 		trg->data = read_sha1_file(trg_entry->idx.sha1, &type, &sz);
+		read_unlock();
 		if (!trg->data)
 			die("object %s cannot be read",
 			    sha1_to_hex(trg_entry->idx.sha1));
@@ -1358,7 +1378,9 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		*mem_usage += sz;
 	}
 	if (!src->data) {
+		read_lock();
 		src->data = read_sha1_file(src_entry->idx.sha1, &type, &sz);
+		read_unlock();
 		if (!src->data)
 			die("object %s cannot be read",
 			    sha1_to_hex(src_entry->idx.sha1));
@@ -1521,6 +1543,56 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
 	free(array);
 }
 
+#ifdef THREADED_DELTA_SEARCH
+
+struct thread_params {
+	pthread_t thread;
+	struct object_entry **list;
+	unsigned list_size;
+	unsigned nr_deltas;
+	int window;
+	int depth;
+};
+
+static void *threaded_find_deltas(void *arg)
+{
+	struct thread_params *p = arg;
+	if (p->list_size)
+		find_deltas(p->list, p->list_size, p->nr_deltas,
+			    p->window, p->depth);
+	return NULL;
+}
+
+static void ll_find_deltas(struct object_entry **list, unsigned list_size,
+			   unsigned nr_deltas, int window, int depth)
+{
+	struct thread_params p[4];
+	int i, ret;
+
+	for (i = 0; i < 4; i++) {
+		unsigned sublist_size = list_size / (4 - i);
+		p[i].list = list;
+		p[i].list_size = sublist_size;
+		p[i].nr_deltas = nr_deltas;
+		p[i].window = window;
+		p[i].depth = depth;
+		ret = pthread_create(&p[i].thread, NULL,
+				     threaded_find_deltas, &p[i]);
+		if (ret)
+			die("unable to create thread: %s", strerror(ret));
+		list += sublist_size;
+		list_size -= sublist_size;
+	}
+
+	for (i = 0; i < 4; i++) {
+		pthread_join(p[i].thread, NULL);
+	}
+}
+
+#else
+#define ll_find_deltas find_deltas
+#endif
+
 static void prepare_pack(int window, int depth)
 {
 	struct object_entry **delta_list;
@@ -1557,7 +1629,7 @@ static void prepare_pack(int window, int depth)
 
 	if (nr_deltas) {
 		qsort(delta_list, n, sizeof(*delta_list), type_size_sort);
-		find_deltas(delta_list, n, nr_deltas, window+1, depth);
+		ll_find_deltas(delta_list, n, nr_deltas, window+1, depth);
 	}
 	free(delta_list);
 }

  reply	other threads:[~2007-08-30  4:28 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-08-11 21:12 performance on repack Jon Smirl
2007-08-11 22:09 ` David Kastrup
2007-08-11 22:34   ` Linus Torvalds
2007-08-11 23:21     ` Jon Smirl
2007-08-12 10:33 ` Martin Koegler
2007-08-12 13:49   ` Jon Smirl
2007-08-14  3:12     ` Shawn O. Pearce
2007-08-14  4:10       ` Jon Smirl
2007-08-14  5:13         ` Shawn O. Pearce
2007-08-14  5:57           ` Jon Smirl
2007-08-14 14:52       ` Nicolas Pitre
2007-08-14 21:41       ` Nicolas Pitre
2007-08-15  1:20         ` Jon Smirl
2007-08-15  1:59           ` Nicolas Pitre
2007-08-15  5:32         ` Shawn O. Pearce
2007-08-15 15:08           ` Jon Smirl
2007-08-15 17:11             ` Martin Koegler
2007-08-15 18:38               ` Jon Smirl
2007-08-15 19:00                 ` Nicolas Pitre
2007-08-15 19:42                   ` Jon Smirl
2007-08-16  8:10                   ` David Kastrup
2007-08-16 15:34                     ` Nicolas Pitre
2007-08-16 16:13                       ` Jon Smirl
2007-08-16 16:21                         ` Nicolas Pitre
2007-08-15 21:05             ` Nicolas Pitre
2007-08-15 20:49           ` Nicolas Pitre
2007-08-30  4:27             ` Nicolas Pitre [this message]
2007-08-30  4:36               ` Nicolas Pitre
2007-08-30 16:17                 ` Jon Smirl
2007-09-01 21:54                 ` Jon Smirl

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=alpine.LFD.0.999.0708300005110.16727@xanadu.home \
    --to=nico@cam.org \
    --cc=git@vger.kernel.org \
    --cc=jonsmirl@gmail.com \
    --cc=mkoegler@auto.tuwien.ac.at \
    --cc=spearce@spearce.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).