From: Christian Couder <christian.couder@gmail.com>
To: git@vger.kernel.org
Cc: Junio C Hamano <gitster@pobox.com>, Jeff King <peff@peff.net>,
Christian Couder <chriscool@tuxfamily.org>,
Ramsay Jones <ramsay@ramsayjones.plus.com>
Subject: [RFC PATCH 10/10] pack-objects: improve partial packfile reuse
Date: Fri, 13 Sep 2019 15:02:26 +0200 [thread overview]
Message-ID: <20190913130226.7449-11-chriscool@tuxfamily.org> (raw)
In-Reply-To: <20190913130226.7449-1-chriscool@tuxfamily.org>
From: Jeff King <peff@peff.net>
Let's store the chunks of the packfile that we reuse in
a dynamic array of `struct reused_chunk`, and let's use
a reuse_packfile_bitmap to speed up reusing parts of
packfiles.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
---
builtin/pack-objects.c | 227 +++++++++++++++++++++++++++++++++--------
pack-bitmap.c | 150 +++++++++++++++++++--------
pack-bitmap.h | 3 +-
3 files changed, 293 insertions(+), 87 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5ee0b3092d..6822ac1026 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -92,7 +92,7 @@ static struct progress *progress_state;
static struct packed_git *reuse_packfile;
static uint32_t reuse_packfile_objects;
-static off_t reuse_packfile_offset;
+static struct bitmap *reuse_packfile_bitmap;
static int use_bitmap_index_default = 1;
static int use_bitmap_index = -1;
@@ -785,57 +785,189 @@ static struct object_entry **compute_write_order(void)
return wo;
}
-static off_t write_reused_pack(struct hashfile *f)
+/*
+ * Record the offsets needed in our reused packfile chunks due to
+ * "gaps" where we omitted some objects.
+ */
+static struct reused_chunk {
+ off_t start;
+ off_t offset;
+} *reused_chunks;
+static int reused_chunks_nr;
+static int reused_chunks_alloc;
+
+static void record_reused_object(off_t where, off_t offset)
{
- unsigned char buffer[8192];
- off_t to_write, total;
- int fd;
+ if (reused_chunks_nr && reused_chunks[reused_chunks_nr-1].offset == offset)
+ return;
- if (!is_pack_valid(reuse_packfile))
- die(_("packfile is invalid: %s"), reuse_packfile->pack_name);
+ ALLOC_GROW(reused_chunks, reused_chunks_nr + 1,
+ reused_chunks_alloc);
+ reused_chunks[reused_chunks_nr].start = where;
+ reused_chunks[reused_chunks_nr].offset = offset;
+ reused_chunks_nr++;
+}
- fd = git_open(reuse_packfile->pack_name);
- if (fd < 0)
- die_errno(_("unable to open packfile for reuse: %s"),
- reuse_packfile->pack_name);
+/*
+ * Binary search to find the chunk that "where" is in. Note
+ * that we're not looking for an exact match, just the first
+ * chunk that contains it (which implicitly ends at the start
+ * of the next chunk.
+ */
+static off_t find_reused_offset(off_t where)
+{
+ int lo = 0, hi = reused_chunks_nr;
+ while (lo < hi) {
+ int mi = lo + ((hi - lo) / 2);
+ if (where == reused_chunks[mi].start)
+ return reused_chunks[mi].offset;
+ if (where < reused_chunks[mi].start)
+ hi = mi;
+ else
+ lo = mi + 1;
+ }
- if (lseek(fd, sizeof(struct pack_header), SEEK_SET) == -1)
- die_errno(_("unable to seek in reused packfile"));
+ /*
+ * The first chunk starts at zero, so we can't have gone below
+ * there.
+ */
+ assert(lo);
+ return reused_chunks[lo-1].offset;
+}
- if (reuse_packfile_offset < 0)
- reuse_packfile_offset = reuse_packfile->pack_size - the_hash_algo->rawsz;
+static void write_reused_pack_one(size_t pos, struct hashfile *out,
+ struct pack_window **w_curs)
+{
+ off_t offset, next, cur;
+ enum object_type type;
+ unsigned long size;
- total = to_write = reuse_packfile_offset - sizeof(struct pack_header);
+ offset = reuse_packfile->revindex[pos].offset;
+ next = reuse_packfile->revindex[pos + 1].offset;
- while (to_write) {
- int read_pack = xread(fd, buffer, sizeof(buffer));
+ record_reused_object(offset, offset - hashfile_total(out));
- if (read_pack <= 0)
- die_errno(_("unable to read from reused packfile"));
+ cur = offset;
+ type = unpack_object_header(reuse_packfile, w_curs, &cur, &size);
+ assert(type >= 0);
- if (read_pack > to_write)
- read_pack = to_write;
+ if (type == OBJ_OFS_DELTA) {
+ off_t base_offset;
+ off_t fixup;
+
+ unsigned char header[MAX_PACK_OBJECT_HEADER];
+ unsigned len;
+
+ base_offset = get_delta_base(reuse_packfile, w_curs, &cur, type, offset);
+ assert(base_offset != 0);
+
+ /* Convert to REF_DELTA if we must... */
+ if (!allow_ofs_delta) {
+ int base_pos = find_revindex_position(reuse_packfile, base_offset);
+ const unsigned char *base_sha1 =
+ nth_packed_object_sha1(reuse_packfile,
+ reuse_packfile->revindex[base_pos].nr);
+
+ len = encode_in_pack_object_header(header, sizeof(header),
+ OBJ_REF_DELTA, size);
+ hashwrite(out, header, len);
+ hashwrite(out, base_sha1, 20);
+ copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur);
+ return;
+ }
- hashwrite(f, buffer, read_pack);
- to_write -= read_pack;
+ /* Otherwise see if we need to rewrite the offset... */
+ fixup = find_reused_offset(offset) -
+ find_reused_offset(base_offset);
+ if (fixup) {
+ unsigned char ofs_header[10];
+ unsigned i, ofs_len;
+ off_t ofs = offset - base_offset - fixup;
+
+ len = encode_in_pack_object_header(header, sizeof(header),
+ OBJ_OFS_DELTA, size);
+
+ i = sizeof(ofs_header) - 1;
+ ofs_header[i] = ofs & 127;
+ while (ofs >>= 7)
+ ofs_header[--i] = 128 | (--ofs & 127);
+
+ ofs_len = sizeof(ofs_header) - i;
+
+ if (0) {
+ off_t expected_size = cur - offset;
+
+ if (len + ofs_len < expected_size) {
+ unsigned max_pad = (len >= 4) ? 9 : 5;
+ header[len - 1] |= 0x80;
+ while (len < max_pad && len + ofs_len < expected_size)
+ header[len++] = 0x80;
+ header[len - 1] &= 0x7F;
+ }
+ }
+
+ hashwrite(out, header, len);
+ hashwrite(out, ofs_header + sizeof(ofs_header) - ofs_len, ofs_len);
+ copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur);
+ return;
+ }
+
+ /* ...otherwise we have no fixup, and can write it verbatim */
+ }
+
+ copy_pack_data(out, reuse_packfile, w_curs, offset, next - offset);
+}
+
+static size_t write_reused_pack_verbatim(struct hashfile *out,
+ struct pack_window **w_curs)
+{
+ size_t pos = 0;
+
+ while (pos < reuse_packfile_bitmap->word_alloc &&
+ reuse_packfile_bitmap->words[pos] == (eword_t)~0)
+ pos++;
+
+ if (pos) {
+ off_t to_write;
+
+ written = (pos * BITS_IN_EWORD);
+ to_write = reuse_packfile->revindex[written].offset
+ - sizeof(struct pack_header);
+
+ record_reused_object(sizeof(struct pack_header), 0);
+ hashflush(out);
+ copy_pack_data(out, reuse_packfile, w_curs,
+ sizeof(struct pack_header), to_write);
- /*
- * We don't know the actual number of objects written,
- * only how many bytes written, how many bytes total, and
- * how many objects total. So we can fake it by pretending all
- * objects we are writing are the same size. This gives us a
- * smooth progress meter, and at the end it matches the true
- * answer.
- */
- written = reuse_packfile_objects *
- (((double)(total - to_write)) / total);
display_progress(progress_state, written);
}
+ return pos;
+}
+
+static void write_reused_pack(struct hashfile *f)
+{
+ size_t i = 0;
+ uint32_t offset;
+ struct pack_window *w_curs = NULL;
+
+ if (allow_ofs_delta)
+ i = write_reused_pack_verbatim(f, &w_curs);
+
+ for (; i < reuse_packfile_bitmap->word_alloc; ++i) {
+ eword_t word = reuse_packfile_bitmap->words[i];
+ size_t pos = (i * BITS_IN_EWORD);
+
+ for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
+ if ((word >> offset) == 0)
+ break;
+
+ offset += ewah_bit_ctz64(word >> offset);
+ write_reused_pack_one(pos + offset, f, &w_curs);
+ display_progress(progress_state, ++written);
+ }
+ }
- close(fd);
- written = reuse_packfile_objects;
- display_progress(progress_state, written);
- return reuse_packfile_offset - sizeof(struct pack_header);
+ unuse_pack(&w_curs);
}
static const char no_split_warning[] = N_(
@@ -868,11 +1000,9 @@ static void write_pack_file(void)
offset = write_pack_header(f, nr_remaining);
if (reuse_packfile) {
- off_t packfile_size;
assert(pack_to_stdout);
-
- packfile_size = write_reused_pack(f);
- offset += packfile_size;
+ write_reused_pack(f);
+ offset = hashfile_total(f);
}
nr_written = 0;
@@ -1002,6 +1132,10 @@ static int have_duplicate_entry(const struct object_id *oid,
{
struct object_entry *entry;
+ if (reuse_packfile_bitmap &&
+ bitmap_walk_contains(bitmap_git, reuse_packfile_bitmap, oid))
+ return 1;
+
entry = packlist_find(&to_pack, oid, index_pos);
if (!entry)
return 0;
@@ -1192,6 +1326,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
create_object_entry(oid, type, pack_name_hash(name),
exclude, name && no_try_delta(name),
index_pos, found_pack, found_offset);
+
return 1;
}
@@ -2571,7 +2706,9 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
static int obj_is_packed(const struct object_id *oid)
{
- return !!packlist_find(&to_pack, oid, NULL);
+ return packlist_find(&to_pack, oid, NULL) ||
+ (reuse_packfile_bitmap &&
+ bitmap_walk_contains(bitmap_git, reuse_packfile_bitmap, oid));
}
static void add_tag_chain(const struct object_id *oid)
@@ -2677,6 +2814,7 @@ static void prepare_pack(int window, int depth)
if (nr_deltas && n > 1) {
unsigned nr_done = 0;
+
if (progress)
progress_state = start_progress(_("Compressing objects"),
nr_deltas);
@@ -3061,7 +3199,6 @@ static void loosen_unused_packed_objects(void)
static int pack_options_allow_reuse(void)
{
return pack_to_stdout &&
- allow_ofs_delta &&
!ignore_packed_keep_on_disk &&
!ignore_packed_keep_in_core &&
(!local || !have_non_local_packs) &&
@@ -3079,7 +3216,7 @@ static int get_object_list_from_bitmap(struct rev_info *revs)
bitmap_git,
&reuse_packfile,
&reuse_packfile_objects,
- &reuse_packfile_offset)) {
+ &reuse_packfile_bitmap)) {
assert(reuse_packfile_objects);
nr_result += reuse_packfile_objects;
display_progress(progress_state, nr_result);
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 1833971dc7..1d4c95ebc1 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -326,6 +326,13 @@ static int load_pack_bitmap(struct bitmap_index *bitmap_git)
munmap(bitmap_git->map, bitmap_git->map_size);
bitmap_git->map = NULL;
bitmap_git->map_size = 0;
+
+ kh_destroy_oid_map(bitmap_git->bitmaps);
+ bitmap_git->bitmaps = NULL;
+
+ kh_destroy_oid_pos(bitmap_git->ext_index.positions);
+ bitmap_git->ext_index.positions = NULL;
+
return -1;
}
@@ -766,65 +773,126 @@ struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs)
return NULL;
}
-int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
- struct packed_git **packfile,
- uint32_t *entries,
- off_t *up_to)
+static void try_partial_reuse(struct bitmap_index *bitmap_git,
+ size_t pos,
+ struct bitmap *reuse,
+ struct pack_window **w_curs)
{
+ struct revindex_entry *revidx;
+ off_t offset;
+ enum object_type type;
+ unsigned long size;
+
+ if (pos >= bitmap_git->pack->num_objects)
+ return; /* not actually in the pack */
+
+ revidx = &bitmap_git->pack->revindex[pos];
+ offset = revidx->offset;
+ type = unpack_object_header(bitmap_git->pack, w_curs, &offset, &size);
+ if (type < 0)
+ return; /* broken packfile, punt */
+
+ if (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA) {
+ off_t base_offset;
+ int base_pos;
+
+ /*
+ * Find the position of the base object so we can look it up
+ * in our bitmaps. If we can't come up with an offset, or if
+ * that offset is not in the revidx, the pack is corrupt.
+ * There's nothing we can do, so just punt on this object,
+ * and the normal slow path will complain about it in
+ * more detail.
+ */
+ base_offset = get_delta_base(bitmap_git->pack, w_curs,
+ &offset, type, revidx->offset);
+ if (!base_offset)
+ return;
+ base_pos = find_revindex_position(bitmap_git->pack, base_offset);
+ if (base_pos < 0)
+ return;
+
+ /*
+ * We assume delta dependencies always point backwards. This
+ * lets us do a single pass, and is basically always true
+ * due to the way OFS_DELTAs work. You would not typically
+ * find REF_DELTA in a bitmapped pack, since we only bitmap
+ * packs we write fresh, and OFS_DELTA is the default). But
+ * let's double check to make sure the pack wasn't written with
+ * odd parameters.
+ */
+ if (base_pos >= pos)
+ return;
+
+ /*
+ * And finally, if we're not sending the base as part of our
+ * reuse chunk, then don't send this object either. The base
+ * would come after us, along with other objects not
+ * necessarily in the pack, which means we'd need to convert
+ * to REF_DELTA on the fly. Better to just let the normal
+ * object_entry code path handle it.
+ */
+ if (!bitmap_get(reuse, base_pos))
+ return;
+ }
+
/*
- * Reuse the packfile content if we need more than
- * 90% of its objects
+ * If we got here, then the object is OK to reuse. Mark it.
*/
- static const double REUSE_PERCENT = 0.9;
+ bitmap_set(reuse, pos);
+}
+int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
+ struct packed_git **packfile_out,
+ uint32_t *entries,
+ struct bitmap **reuse_out)
+{
struct bitmap *result = bitmap_git->result;
- uint32_t reuse_threshold;
- uint32_t i, reuse_objects = 0;
+ struct bitmap *reuse;
+ struct pack_window *w_curs = NULL;
+ size_t i = 0;
+ uint32_t offset;
assert(result);
- for (i = 0; i < result->word_alloc; ++i) {
- if (result->words[i] != (eword_t)~0) {
- reuse_objects += ewah_bit_ctz64(~result->words[i]);
- break;
- }
-
- reuse_objects += BITS_IN_EWORD;
- }
+ while (i < result->word_alloc && result->words[i] == (eword_t)~0)
+ i++;
-#ifdef GIT_BITMAP_DEBUG
- {
- const unsigned char *sha1;
- struct revindex_entry *entry;
+ /* Don't mark objects not in the packfile */
+ if (i > bitmap_git->pack->num_objects / BITS_IN_EWORD)
+ i = bitmap_git->pack->num_objects / BITS_IN_EWORD;
- entry = &bitmap_git->reverse_index->revindex[reuse_objects];
- sha1 = nth_packed_object_sha1(bitmap_git->pack, entry->nr);
+ reuse = bitmap_word_alloc(i);
+ memset(reuse->words, 0xFF, i * sizeof(eword_t));
- fprintf(stderr, "Failed to reuse at %d (%016llx)\n",
- reuse_objects, result->words[i]);
- fprintf(stderr, " %s\n", hash_to_hex(sha1));
- }
-#endif
+ for (; i < result->word_alloc; ++i) {
+ eword_t word = result->words[i];
+ size_t pos = (i * BITS_IN_EWORD);
- if (!reuse_objects)
- return -1;
+ for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
+ if ((word >> offset) == 0)
+ break;
- if (reuse_objects >= bitmap_git->pack->num_objects) {
- bitmap_git->reuse_objects = *entries = bitmap_git->pack->num_objects;
- *up_to = -1; /* reuse the full pack */
- *packfile = bitmap_git->pack;
- return 0;
+ offset += ewah_bit_ctz64(word >> offset);
+ try_partial_reuse(bitmap_git, pos + offset, reuse, &w_curs);
+ }
}
- reuse_threshold = bitmap_popcount(bitmap_git->result) * REUSE_PERCENT;
+ unuse_pack(&w_curs);
- if (reuse_objects < reuse_threshold)
+ *entries = bitmap_popcount(reuse);
+ if (!*entries) {
+ bitmap_free(reuse);
return -1;
+ }
- bitmap_git->reuse_objects = *entries = reuse_objects;
- *up_to = bitmap_git->pack->revindex[reuse_objects].offset;
- *packfile = bitmap_git->pack;
-
+ /*
+ * Drop any reused objects from the result, since they will not
+ * need to be handled separately.
+ */
+ bitmap_and_not(result, reuse);
+ *packfile_out = bitmap_git->pack;
+ *reuse_out = reuse;
return 0;
}
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 5425767f0f..7af7335f2e 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -50,7 +50,8 @@ void test_bitmap_walk(struct rev_info *revs);
struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs);
int reuse_partial_packfile_from_bitmap(struct bitmap_index *,
struct packed_git **packfile,
- uint32_t *entries, off_t *up_to);
+ uint32_t *entries,
+ struct bitmap **bitmap);
int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping,
kh_oid_map_t *reused_bitmaps, int show_progress);
void free_bitmap_index(struct bitmap_index *);
--
2.23.0.46.gd213b4aca1.dirty
next prev parent reply other threads:[~2019-09-13 13:03 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-09-13 13:02 [RFC PATCH 00/10] Rewrite packfile reuse code Christian Couder
2019-09-13 13:02 ` [RFC PATCH 01/10] builtin/pack-objects: report reused packfile objects Christian Couder
2019-09-13 13:02 ` [RFC PATCH 02/10] packfile: expose get_delta_base() Christian Couder
2019-09-13 13:02 ` [RFC PATCH 03/10] ewah/bitmap: introduce bitmap_word_alloc() Christian Couder
2019-09-13 13:02 ` [RFC PATCH 04/10] ewah/bitmap: always allocate 2 more words Christian Couder
2019-10-10 23:40 ` Jonathan Tan
2019-10-11 7:49 ` Christian Couder
2019-10-11 18:05 ` Jeff King
2019-09-13 13:02 ` [RFC PATCH 05/10] pack-bitmap: don't rely on bitmap_git->reuse_objects Christian Couder
2019-10-10 23:44 ` Jonathan Tan
2019-10-11 7:50 ` Christian Couder
2019-09-13 13:02 ` [RFC PATCH 06/10] pack-bitmap: introduce bitmap_walk_contains() Christian Couder
2019-09-13 13:02 ` [RFC PATCH 07/10] csum-file: introduce hashfile_total() Christian Couder
2019-09-13 13:02 ` [RFC PATCH 08/10] pack-objects: introduce pack.allowPackReuse Christian Couder
2019-09-13 21:37 ` Junio C Hamano
2019-09-13 13:02 ` [RFC PATCH 09/10] builtin/pack-objects: introduce obj_is_packed() Christian Couder
2019-09-13 13:02 ` Christian Couder [this message]
2019-09-13 22:29 ` [RFC PATCH 10/10] pack-objects: improve partial packfile reuse Junio C Hamano
2019-09-14 2:02 ` Jeff King
2019-09-14 3:06 ` Junio C Hamano
2019-10-02 15:57 ` Jeff King
2019-10-03 2:06 ` Junio C Hamano
2019-10-03 6:55 ` Christian Couder
2019-10-10 23:59 ` Jonathan Tan
2019-10-11 7:39 ` Christian Couder
2019-10-11 18:01 ` Jeff King
2019-10-11 21:04 ` Jonathan Tan
2019-10-12 0:04 ` Junio C Hamano
2019-10-13 7:38 ` Jeff King
2019-10-17 7:03 ` Junio C Hamano
2019-10-17 7:23 ` Jeff King
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190913130226.7449-11-chriscool@tuxfamily.org \
--to=christian.couder@gmail.com \
--cc=chriscool@tuxfamily.org \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=peff@peff.net \
--cc=ramsay@ramsayjones.plus.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).