From: Derrick Stolee <stolee@gmail.com>
To: git@vger.kernel.org
Cc: dstolee@microsoft.com, stolee@gmail.com, git@jeffhostetler.com,
peff@peff.net, gitster@pobox.com, Johannes.Shindelin@gmx.de,
jrnieder@gmail.com
Subject: [RFC PATCH 04/18] midx: write multi-pack indexes for an object list
Date: Sun, 7 Jan 2018 13:14:45 -0500 [thread overview]
Message-ID: <20180107181459.222909-5-dstolee@microsoft.com> (raw)
In-Reply-To: <20180107181459.222909-1-dstolee@microsoft.com>
The write_midx_file() method takes a list of packfiles and indexed
objects with offset information and writes according to the format
in Documentation/technical/pack-format.txt. The chunks are separated
into methods.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
Makefile | 1 +
midx.c | 412 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
midx.h | 42 +++++++
3 files changed, 455 insertions(+)
create mode 100644 midx.c
create mode 100644 midx.h
diff --git a/Makefile b/Makefile
index 2a81ae22e9..d0d810951f 100644
--- a/Makefile
+++ b/Makefile
@@ -827,6 +827,7 @@ LIB_OBJS += merge.o
LIB_OBJS += merge-blobs.o
LIB_OBJS += merge-recursive.o
LIB_OBJS += mergesort.o
+LIB_OBJS += midx.o
LIB_OBJS += mru.o
LIB_OBJS += name-hash.o
LIB_OBJS += notes.o
diff --git a/midx.c b/midx.c
new file mode 100644
index 0000000000..5c320726ed
--- /dev/null
+++ b/midx.c
@@ -0,0 +1,412 @@
+#include "cache.h"
+#include "git-compat-util.h"
+#include "pack.h"
+#include "packfile.h"
+#include "midx.h"
+
+#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
+#define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
+#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
+#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
+#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
+#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
+#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
+
+#define MIDX_VERSION_1 1
+#define MIDX_VERSION MIDX_VERSION_1
+
+#define MIDX_OID_VERSION_SHA1 1
+#define MIDX_OID_LEN_SHA1 20
+#define MIDX_OID_VERSION MIDX_OID_VERSION_SHA1
+#define MIDX_OID_LEN MIDX_OID_LEN_SHA1
+
+#define MIDX_LARGE_OFFSET_NEEDED 0x80000000
+
+char* get_midx_filename_oid(const char *pack_dir,
+ struct object_id *oid)
+{
+ struct strbuf head_path = STRBUF_INIT;
+ strbuf_addstr(&head_path, pack_dir);
+ strbuf_addstr(&head_path, "/midx-");
+ strbuf_addstr(&head_path, oid_to_hex(oid));
+ strbuf_addstr(&head_path, ".midx");
+
+ return strbuf_detach(&head_path, NULL);
+}
+
+struct pack_midx_details_internal {
+ uint32_t pack_int_id;
+ uint32_t internal_offset;
+};
+
+static int midx_sha1_compare(const void *_a, const void *_b)
+{
+ struct pack_midx_entry *a = *(struct pack_midx_entry **)_a;
+ struct pack_midx_entry *b = *(struct pack_midx_entry **)_b;
+ return oidcmp(&a->oid, &b->oid);
+}
+
+static void write_midx_chunk_packlookup(
+ struct sha1file *f,
+ const char **pack_names, uint32_t nr_packs)
+{
+ uint32_t i, cur_len = 0;
+
+ for (i = 0; i < nr_packs; i++) {
+ uint32_t swap_len = htonl(cur_len);
+ sha1write(f, &swap_len, 4);
+ cur_len += strlen(pack_names[i]) + 1;
+ }
+}
+
+static void write_midx_chunk_packnames(
+ struct sha1file *f,
+ const char **pack_names, uint32_t nr_packs)
+{
+ uint32_t i;
+ for (i = 0; i < nr_packs; i++)
+ sha1write(f, pack_names[i], strlen(pack_names[i]) + 1);
+}
+
+static void write_midx_chunk_oidfanout(
+ struct sha1file *f,
+ struct pack_midx_entry **objects, uint32_t nr_objects)
+{
+ struct pack_midx_entry **list = objects;
+ struct pack_midx_entry **last = objects + nr_objects;
+ uint32_t count_distinct = 0;
+ uint32_t i;
+
+ /*
+ * Write the first-level table (the list is sorted,
+ * but we use a 256-entry lookup to be able to avoid
+ * having to do eight extra binary search iterations).
+ */
+ for (i = 0; i < 256; i++) {
+ struct pack_midx_entry **next = list;
+ struct pack_midx_entry *prev = 0;
+ uint32_t swap_distinct;
+
+ while (next < last) {
+ struct pack_midx_entry *obj = *next;
+ if (obj->oid.hash[0] != i)
+ break;
+
+ if (!prev || oidcmp(&(prev->oid), &(obj->oid)))
+ count_distinct++;
+
+ prev = obj;
+ next++;
+ }
+
+ swap_distinct = htonl(count_distinct);
+ sha1write(f, &swap_distinct, 4);
+ list = next;
+ }
+}
+
+static void write_midx_chunk_oidlookup(
+ struct sha1file *f, unsigned char hash_len,
+ struct pack_midx_entry **objects, uint32_t nr_objects)
+{
+ struct pack_midx_entry **list = objects;
+ struct object_id *last_oid = 0;
+ uint32_t i;
+
+ for (i = 0; i < nr_objects; i++) {
+ struct pack_midx_entry *obj = *list++;
+
+ if (last_oid && !oidcmp(last_oid, &obj->oid))
+ continue;
+
+ last_oid = &obj->oid;
+ sha1write(f, obj->oid.hash, (int)hash_len);
+ }
+}
+
+static void write_midx_chunk_objectoffsets(
+ struct sha1file *f, int large_offset_needed,
+ struct pack_midx_entry **objects, uint32_t nr_objects, uint32_t *pack_perm)
+{
+ struct pack_midx_entry **list = objects;
+ struct object_id *last_oid = 0;
+ uint32_t i, nr_large_offset = 0;
+
+ for (i = 0; i < nr_objects; i++) {
+ struct pack_midx_details_internal details;
+ struct pack_midx_entry *obj = *list++;
+
+ if (last_oid && !oidcmp(last_oid, &obj->oid))
+ continue;
+
+ last_oid = &obj->oid;
+
+ details.pack_int_id = htonl(pack_perm[obj->pack_int_id]);
+
+ if (large_offset_needed && obj->offset >> 31)
+ details.internal_offset = (MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++);
+ else
+ details.internal_offset = (uint32_t)obj->offset;
+
+ details.internal_offset = htonl(details.internal_offset);
+ sha1write(f, &details, 8);
+ }
+}
+
+static void write_midx_chunk_largeoffsets(
+ struct sha1file *f, uint32_t nr_large_offset,
+ struct pack_midx_entry **objects, uint32_t nr_objects)
+{
+ struct pack_midx_entry **list = objects;
+ struct object_id *last_oid = 0;
+
+ while (nr_large_offset) {
+ struct pack_midx_entry *obj = *list++;
+ uint64_t offset = obj->offset;
+ uint32_t split[2];
+
+ if (last_oid && !oidcmp(last_oid, &obj->oid))
+ continue;
+
+ last_oid = &obj->oid;
+
+ if (!(offset >> 31))
+ continue;
+
+ split[0] = htonl(offset >> 32);
+ split[1] = htonl(offset & 0xffffffff);
+
+ sha1write(f, split, 8);
+ nr_large_offset--;
+ }
+}
+
+struct pack_pair {
+ uint32_t pack_int_id;
+ const char *pack_name;
+};
+
+static int pack_pair_compare(const void *_a, const void *_b)
+{
+ struct pack_pair *a = (struct pack_pair *)_a;
+ struct pack_pair *b = (struct pack_pair *)_b;
+ return strcmp(a->pack_name, b->pack_name);
+}
+
+static void sort_packs_by_name(const char **pack_names, uint32_t nr_packs, uint32_t *perm)
+{
+ uint32_t i;
+ struct pack_pair *pairs;
+
+ ALLOC_ARRAY(pairs, nr_packs);
+
+ for (i = 0; i < nr_packs; i++) {
+ pairs[i].pack_int_id = i;
+ pairs[i].pack_name = pack_names[i];
+ }
+
+ QSORT(pairs, nr_packs, pack_pair_compare);
+
+ for (i = 0; i < nr_packs; i++) {
+ pack_names[i] = pairs[i].pack_name;
+ perm[pairs[i].pack_int_id] = i;
+ }
+}
+
+const char *write_midx_file(const char *pack_dir,
+ const char *midx_name,
+ const char **pack_names,
+ uint32_t nr_packs,
+ struct pack_midx_entry **objects,
+ uint32_t nr_objects)
+{
+ struct sha1file *f;
+ struct pack_midx_entry **sorted_by_sha;
+ int i, chunk, fd;
+ struct pack_midx_header hdr;
+ uint32_t chunk_ids[7];
+ uint64_t chunk_offsets[7];
+ unsigned char large_offset_needed = 0;
+ unsigned int nr_large_offset = 0;
+ unsigned char final_hash[GIT_MAX_RAWSZ];
+ const char *final_hex;
+ int rename_needed = 0;
+ uint32_t count_distinct = 0;
+ int total_name_len = 0;
+ uint32_t *pack_perm;
+
+ if (!core_midx)
+ return 0;
+
+ /* Sort packs */
+ if (nr_packs) {
+ ALLOC_ARRAY(pack_perm, nr_packs);
+ sort_packs_by_name(pack_names, nr_packs, pack_perm);
+ } else {
+ pack_perm = 0;
+ }
+
+ /* Sort objects */
+ if (nr_objects) {
+ sorted_by_sha = objects;
+
+ QSORT(sorted_by_sha, nr_objects, midx_sha1_compare);
+
+ for (i = 0; i < nr_objects; i++) {
+ if (i &&
+ !oidcmp(&sorted_by_sha[i-1]->oid, &sorted_by_sha[i]->oid))
+ continue;
+
+ count_distinct++;
+
+ if (sorted_by_sha[i]->offset > 0x7fffffff)
+ nr_large_offset++;
+ if (sorted_by_sha[i]->offset > 0xffffffff)
+ large_offset_needed = 1;
+ }
+ } else {
+ sorted_by_sha = NULL;
+ }
+
+ for (i = 0; i < nr_packs; i++)
+ total_name_len += strlen(pack_names[i]) + 1;
+
+ /* open temp file, or direct file if given */
+ if (!midx_name) {
+ struct strbuf tmp_file = STRBUF_INIT;
+ strbuf_addstr(&tmp_file, pack_dir);
+ strbuf_addstr(&tmp_file, "/tmp_midx_XXXXXX");
+
+ fd = git_mkstemp_mode(tmp_file.buf, 0444);
+ if (fd < 0)
+ die_errno("unable to create '%s'", tmp_file.buf);
+
+ midx_name = strbuf_detach(&tmp_file, NULL);
+ rename_needed = 1;
+ } else {
+ unlink(midx_name);
+ fd = open(midx_name, O_CREAT|O_EXCL|O_WRONLY, 0600);
+ if (fd < 0)
+ die_errno("unable to create '%s'", midx_name);
+ }
+ f = sha1fd(fd, midx_name);
+
+ /* fill header info */
+ hdr.midx_signature = htonl(MIDX_SIGNATURE);
+ hdr.midx_version = htonl(MIDX_VERSION);
+
+ hdr.hash_version = MIDX_OID_VERSION;
+ hdr.hash_len = MIDX_OID_LEN;
+ hdr.num_base_midx = 0;
+ hdr.num_packs = htonl(nr_packs);
+
+ /*
+ * We expect the following chunks, which are required:
+ *
+ * Packfile Name Lookup
+ * Packfile Names
+ * OID Fanout
+ * OID Lookup
+ * Object Offsets
+ */
+ hdr.num_chunks = large_offset_needed ? 6 : 5;
+
+ /* write header to file */
+ assert(sizeof(hdr) == 16);
+ sha1write(f, &hdr, sizeof(hdr));
+
+ /*
+ * Fill initial chunk values using offsets
+ * relative to first chunk.
+ */
+ chunk_offsets[0] = sizeof(hdr) + 12 * (hdr.num_chunks + 1);
+ chunk_ids[0] = MIDX_CHUNKID_PACKLOOKUP;
+ chunk_offsets[1] = chunk_offsets[0] + nr_packs * 4;
+ chunk_ids[1] = MIDX_CHUNKID_OIDFANOUT;
+ chunk_offsets[2] = chunk_offsets[1] + 256 * 4;
+ chunk_ids[2] = MIDX_CHUNKID_OIDLOOKUP;
+ chunk_offsets[3] = chunk_offsets[2] + (uint64_t)count_distinct
+ * (uint64_t)hdr.hash_len;
+ chunk_ids[3] = MIDX_CHUNKID_OBJECTOFFSETS;
+ chunk_offsets[4] = chunk_offsets[3] + 8 * (uint64_t)count_distinct;
+
+ if (large_offset_needed) {
+ chunk_ids[4] = MIDX_CHUNKID_LARGEOFFSETS;
+ chunk_offsets[5] = chunk_offsets[4] + 8 * (uint64_t)nr_large_offset;
+ chunk_ids[5] = MIDX_CHUNKID_PACKNAMES;
+ chunk_offsets[6] = chunk_offsets[5] + total_name_len;
+ chunk_ids[6] = 0;
+ } else {
+ chunk_ids[4] = MIDX_CHUNKID_PACKNAMES;
+ chunk_offsets[5] = chunk_offsets[4] + total_name_len;
+ chunk_ids[5] = 0;
+ }
+
+ for (i = 0; i <= hdr.num_chunks; i++) {
+ uint32_t chunk_write[3];
+
+ chunk_write[0] = htonl(chunk_ids[i]);
+ chunk_write[1] = htonl(chunk_offsets[i] >> 32);
+ chunk_write[2] = htonl(chunk_offsets[i] & 0xffffffff);
+ sha1write(f, chunk_write, 12);
+ }
+
+ for (chunk = 0; chunk < hdr.num_chunks; chunk++) {
+ switch (chunk_ids[chunk]) {
+ case MIDX_CHUNKID_PACKLOOKUP:
+ write_midx_chunk_packlookup(f, pack_names, nr_packs);
+ break;
+
+ case MIDX_CHUNKID_PACKNAMES:
+ write_midx_chunk_packnames(f, pack_names, nr_packs);
+ break;
+
+ case MIDX_CHUNKID_OIDFANOUT:
+ write_midx_chunk_oidfanout(f, sorted_by_sha, nr_objects);
+ break;
+
+ case MIDX_CHUNKID_OIDLOOKUP:
+ write_midx_chunk_oidlookup(f, hdr.hash_len, sorted_by_sha,
+ nr_objects);
+ break;
+
+ case MIDX_CHUNKID_OBJECTOFFSETS:
+ write_midx_chunk_objectoffsets(f, large_offset_needed,
+ sorted_by_sha, nr_objects,
+ pack_perm);
+ break;
+
+ case MIDX_CHUNKID_LARGEOFFSETS:
+ write_midx_chunk_largeoffsets(f, nr_large_offset,
+ sorted_by_sha, nr_objects);
+ break;
+
+ case 0:
+ break;
+
+ default:
+ die("unrecognized MIDX chunk id: %08x", chunk_ids[chunk]);
+ }
+ }
+
+ sha1close(f, final_hash, CSUM_CLOSE | CSUM_FSYNC);
+
+ if (rename_needed)
+ {
+ struct object_id oid;
+ char *fname;
+
+ memcpy(oid.hash, final_hash, GIT_MAX_RAWSZ);
+ fname = get_midx_filename_oid(pack_dir, &oid);
+ final_hex = sha1_to_hex(final_hash);
+
+ if (rename(midx_name, fname))
+ die("failed to rename %s to %s", midx_name, fname);
+
+ free(fname);
+ } else {
+ final_hex = midx_name;
+ }
+
+ return final_hex;
+}
diff --git a/midx.h b/midx.h
new file mode 100644
index 0000000000..4b00463651
--- /dev/null
+++ b/midx.h
@@ -0,0 +1,42 @@
+#ifndef MIDX_H
+#define MIDX_H
+
+#include "git-compat-util.h"
+#include "object.h"
+#include "csum-file.h"
+
+extern char *get_midx_filename_oid(const char *pack_dir,
+ struct object_id *oid);
+
+struct pack_midx_entry {
+ struct object_id oid;
+ uint32_t pack_int_id;
+ off_t offset;
+};
+
+struct pack_midx_header {
+ uint32_t midx_signature;
+ uint32_t midx_version;
+ unsigned char hash_version;
+ unsigned char hash_len;
+ unsigned char num_base_midx;
+ unsigned char num_chunks;
+ uint32_t num_packs;
+};
+
+/*
+ * Write a single MIDX file storing the given entries for the
+ * given list of packfiles. If midx_name is null, then a temp
+ * file will be created and swapped using the result hash value.
+ * Otherwise, write directly to midx_name.
+ *
+ * Returns the final name of the MIDX file within pack_dir.
+ */
+extern const char *write_midx_file(const char *pack_dir,
+ const char *midx_name,
+ const char **pack_names,
+ uint32_t nr_packs,
+ struct pack_midx_entry **objects,
+ uint32_t nr_objects);
+
+#endif
--
2.15.0
next prev parent reply other threads:[~2018-01-07 18:15 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-01-07 18:14 [RFC PATCH 00/18] Multi-pack index (MIDX) Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 01/18] docs: Multi-Pack Index (MIDX) Design Notes Derrick Stolee
2018-01-08 19:32 ` Jonathan Tan
2018-01-08 20:35 ` Derrick Stolee
2018-01-08 22:06 ` Jonathan Tan
2018-01-07 18:14 ` [RFC PATCH 02/18] midx: specify midx file format Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 03/18] midx: create core.midx config setting Derrick Stolee
2018-01-07 18:14 ` Derrick Stolee [this message]
2018-01-07 18:14 ` [RFC PATCH 05/18] midx: create midx builtin with --write mode Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 06/18] midx: add t5318-midx.sh test script Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 07/18] midx: teach midx --write to update midx-head Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 08/18] midx: teach git-midx to read midx file details Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 09/18] midx: find details of nth object in midx Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 10/18] midx: use existing midx when writing Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 11/18] midx: teach git-midx to clear midx files Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 12/18] midx: teach git-midx to delete expired files Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 13/18] t5318-midx.h: confirm git actions are stable Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 14/18] midx: load midx files when loading packs Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 15/18] midx: use midx for approximate object count Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 16/18] midx: nth_midxed_object_oid() and bsearch_midx() Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 17/18] sha1_name: use midx for abbreviations Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 18/18] packfile: use midx for object loads Derrick Stolee
2018-01-07 22:42 ` [RFC PATCH 00/18] Multi-pack index (MIDX) Ævar Arnfjörð Bjarmason
2018-01-08 0:08 ` Derrick Stolee
2018-01-08 10:20 ` Jeff King
2018-01-08 10:27 ` Jeff King
2018-01-08 12:28 ` Ævar Arnfjörð Bjarmason
2018-01-08 13:43 ` Johannes Schindelin
2018-01-09 6:50 ` Jeff King
2018-01-09 13:05 ` Johannes Schindelin
2018-01-09 19:51 ` Stefan Beller
2018-01-09 20:12 ` Junio C Hamano
2018-01-09 20:16 ` Stefan Beller
2018-01-09 21:31 ` Junio C Hamano
2018-01-10 17:05 ` Johannes Schindelin
2018-01-10 10:57 ` Jeff King
2018-01-08 13:43 ` Derrick Stolee
2018-01-09 7:12 ` Jeff King
2018-01-08 11:43 ` Ævar Arnfjörð Bjarmason
2018-06-06 8:13 ` Ævar Arnfjörð Bjarmason
2018-06-06 10:27 ` [RFC PATCH 0/2] unconditional O(1) SHA-1 abbreviation Ævar Arnfjörð Bjarmason
2018-06-06 10:27 ` [RFC PATCH 1/2] config.c: use braces on multiple conditional arms Ævar Arnfjörð Bjarmason
2018-06-06 10:27 ` [RFC PATCH 2/2] sha1-name: add core.validateAbbrev & relative core.abbrev Ævar Arnfjörð Bjarmason
2018-06-06 12:04 ` Christian Couder
2018-06-06 11:24 ` [RFC PATCH 00/18] Multi-pack index (MIDX) Derrick Stolee
2018-01-10 18:25 ` Martin Fick
2018-01-10 19:39 ` Derrick Stolee
2018-01-10 21:01 ` Martin Fick
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180107181459.222909-5-dstolee@microsoft.com \
--to=stolee@gmail.com \
--cc=Johannes.Shindelin@gmx.de \
--cc=dstolee@microsoft.com \
--cc=git@jeffhostetler.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=jrnieder@gmail.com \
--cc=peff@peff.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).