git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Derrick Stolee <stolee@gmail.com>
To: git@vger.kernel.org
Cc: dstolee@microsoft.com, stolee@gmail.com, git@jeffhostetler.com,
	peff@peff.net, gitster@pobox.com, Johannes.Shindelin@gmx.de,
	jrnieder@gmail.com
Subject: [RFC PATCH 04/18] midx: write multi-pack indexes for an object list
Date: Sun,  7 Jan 2018 13:14:45 -0500	[thread overview]
Message-ID: <20180107181459.222909-5-dstolee@microsoft.com> (raw)
In-Reply-To: <20180107181459.222909-1-dstolee@microsoft.com>

The write_midx_file() method takes a list of packfiles and indexed
objects with offset information and writes according to the format
in Documentation/technical/pack-format.txt. The chunks are separated
into methods.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Makefile |   1 +
 midx.c   | 412 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 midx.h   |  42 +++++++
 3 files changed, 455 insertions(+)
 create mode 100644 midx.c
 create mode 100644 midx.h

diff --git a/Makefile b/Makefile
index 2a81ae22e9..d0d810951f 100644
--- a/Makefile
+++ b/Makefile
@@ -827,6 +827,7 @@ LIB_OBJS += merge.o
 LIB_OBJS += merge-blobs.o
 LIB_OBJS += merge-recursive.o
 LIB_OBJS += mergesort.o
+LIB_OBJS += midx.o
 LIB_OBJS += mru.o
 LIB_OBJS += name-hash.o
 LIB_OBJS += notes.o
diff --git a/midx.c b/midx.c
new file mode 100644
index 0000000000..5c320726ed
--- /dev/null
+++ b/midx.c
@@ -0,0 +1,412 @@
+#include "cache.h"
+#include "git-compat-util.h"
+#include "pack.h"
+#include "packfile.h"
+#include "midx.h"
+
+#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
+#define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
+#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
+#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
+#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
+#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
+#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
+
+#define MIDX_VERSION_1 1
+#define MIDX_VERSION MIDX_VERSION_1
+
+#define MIDX_OID_VERSION_SHA1 1
+#define MIDX_OID_LEN_SHA1 20
+#define MIDX_OID_VERSION MIDX_OID_VERSION_SHA1
+#define MIDX_OID_LEN MIDX_OID_LEN_SHA1
+
+#define MIDX_LARGE_OFFSET_NEEDED 0x80000000
+
+char* get_midx_filename_oid(const char *pack_dir,
+			    struct object_id *oid)
+{
+	struct strbuf head_path = STRBUF_INIT;
+	strbuf_addstr(&head_path, pack_dir);
+	strbuf_addstr(&head_path, "/midx-");
+	strbuf_addstr(&head_path, oid_to_hex(oid));
+	strbuf_addstr(&head_path, ".midx");
+
+	return strbuf_detach(&head_path, NULL);
+}
+
+struct pack_midx_details_internal {
+	uint32_t pack_int_id;
+	uint32_t internal_offset;
+};
+
+static int midx_sha1_compare(const void *_a, const void *_b)
+{
+	struct pack_midx_entry *a = *(struct pack_midx_entry **)_a;
+	struct pack_midx_entry *b = *(struct pack_midx_entry **)_b;
+	return oidcmp(&a->oid, &b->oid);
+}
+
+static void write_midx_chunk_packlookup(
+	struct sha1file *f,
+	const char **pack_names, uint32_t nr_packs)
+{
+	uint32_t i, cur_len = 0;
+
+	for (i = 0; i < nr_packs; i++) {
+		uint32_t swap_len = htonl(cur_len);
+		sha1write(f, &swap_len, 4);
+		cur_len += strlen(pack_names[i]) + 1;
+	}
+}
+
+static void write_midx_chunk_packnames(
+	struct sha1file *f,
+	const char **pack_names, uint32_t nr_packs)
+{
+	uint32_t i;
+	for (i = 0; i < nr_packs; i++)
+		sha1write(f, pack_names[i], strlen(pack_names[i]) + 1);
+}
+
+static void write_midx_chunk_oidfanout(
+	struct sha1file *f,
+	struct pack_midx_entry **objects, uint32_t nr_objects)
+{
+	struct pack_midx_entry **list = objects;
+	struct pack_midx_entry **last = objects + nr_objects;
+	uint32_t count_distinct = 0;
+	uint32_t i;
+
+	/*
+	* Write the first-level table (the list is sorted,
+	* but we use a 256-entry lookup to be able to avoid
+	* having to do eight extra binary search iterations).
+	*/
+	for (i = 0; i < 256; i++) {
+		struct pack_midx_entry **next = list;
+		struct pack_midx_entry *prev = 0;
+		uint32_t swap_distinct;
+
+		while (next < last) {
+			struct pack_midx_entry *obj = *next;
+			if (obj->oid.hash[0] != i)
+				break;
+
+			if (!prev || oidcmp(&(prev->oid), &(obj->oid)))
+				count_distinct++;
+
+			prev = obj;
+			next++;
+		}
+
+		swap_distinct = htonl(count_distinct);
+		sha1write(f, &swap_distinct, 4);
+		list = next;
+	}
+}
+
+static void write_midx_chunk_oidlookup(
+	struct sha1file *f, unsigned char hash_len,
+	struct pack_midx_entry **objects, uint32_t nr_objects)
+{
+	struct pack_midx_entry **list = objects;
+	struct object_id *last_oid = 0;
+	uint32_t i;
+
+	for (i = 0; i < nr_objects; i++) {
+		struct pack_midx_entry *obj = *list++;
+
+		if (last_oid && !oidcmp(last_oid, &obj->oid))
+			continue;
+
+		last_oid = &obj->oid;
+		sha1write(f, obj->oid.hash, (int)hash_len);
+	}
+}
+
+static void write_midx_chunk_objectoffsets(
+	struct sha1file *f, int large_offset_needed,
+	struct pack_midx_entry **objects, uint32_t nr_objects, uint32_t *pack_perm)
+{
+	struct pack_midx_entry **list = objects;
+	struct object_id *last_oid = 0;
+	uint32_t i, nr_large_offset = 0;
+
+	for (i = 0; i < nr_objects; i++) {
+		struct pack_midx_details_internal details;
+		struct pack_midx_entry *obj = *list++;
+
+		if (last_oid && !oidcmp(last_oid, &obj->oid))
+			continue;
+
+		last_oid = &obj->oid;
+
+		details.pack_int_id = htonl(pack_perm[obj->pack_int_id]);
+
+		if (large_offset_needed && obj->offset >> 31)
+			details.internal_offset = (MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++);
+		else
+			details.internal_offset = (uint32_t)obj->offset;
+
+		details.internal_offset = htonl(details.internal_offset);
+		sha1write(f, &details, 8);
+	}
+}
+
+static void write_midx_chunk_largeoffsets(
+	struct sha1file *f, uint32_t nr_large_offset,
+	struct pack_midx_entry **objects, uint32_t nr_objects)
+{
+	struct pack_midx_entry **list = objects;
+	struct object_id *last_oid = 0;
+
+	while (nr_large_offset) {
+		struct pack_midx_entry *obj = *list++;
+		uint64_t offset = obj->offset;
+		uint32_t split[2];
+
+		if (last_oid && !oidcmp(last_oid, &obj->oid))
+			continue;
+
+		last_oid = &obj->oid;
+
+		if (!(offset >> 31))
+			continue;
+
+		split[0] = htonl(offset >> 32);
+		split[1] = htonl(offset & 0xffffffff);
+
+		sha1write(f, split, 8);
+		nr_large_offset--;
+	}
+}
+
+struct pack_pair {
+	uint32_t pack_int_id;
+	const char *pack_name;
+};
+
+static int pack_pair_compare(const void *_a, const void *_b)
+{
+	struct pack_pair *a = (struct pack_pair *)_a;
+	struct pack_pair *b = (struct pack_pair *)_b;
+	return strcmp(a->pack_name, b->pack_name);
+}
+
+static void sort_packs_by_name(const char **pack_names, uint32_t nr_packs, uint32_t *perm)
+{
+	uint32_t i;
+	struct pack_pair *pairs;
+
+	ALLOC_ARRAY(pairs, nr_packs);
+
+	for (i = 0; i < nr_packs; i++) {
+		pairs[i].pack_int_id = i;
+		pairs[i].pack_name = pack_names[i];
+	}
+
+	QSORT(pairs, nr_packs, pack_pair_compare);
+
+	for (i = 0; i < nr_packs; i++) {
+		pack_names[i] = pairs[i].pack_name;
+		perm[pairs[i].pack_int_id] = i;
+	}
+}
+
+const char *write_midx_file(const char *pack_dir,
+			    const char *midx_name,
+			    const char **pack_names,
+			    uint32_t nr_packs,
+			    struct pack_midx_entry **objects,
+			    uint32_t nr_objects)
+{
+	struct sha1file *f;
+	struct pack_midx_entry **sorted_by_sha;
+	int i, chunk, fd;
+	struct pack_midx_header hdr;
+	uint32_t chunk_ids[7];
+	uint64_t chunk_offsets[7];
+	unsigned char large_offset_needed = 0;
+	unsigned int nr_large_offset = 0;
+	unsigned char final_hash[GIT_MAX_RAWSZ];
+	const char *final_hex;
+	int rename_needed = 0;
+	uint32_t count_distinct = 0;
+	int total_name_len = 0;
+	uint32_t *pack_perm;
+
+	if (!core_midx)
+		return 0;
+
+	/* Sort packs */
+	if (nr_packs) {
+		ALLOC_ARRAY(pack_perm, nr_packs);
+		sort_packs_by_name(pack_names, nr_packs, pack_perm);
+	} else {
+		pack_perm = 0;
+	}
+
+	/* Sort objects */
+	if (nr_objects) {
+		sorted_by_sha = objects;
+
+		QSORT(sorted_by_sha, nr_objects, midx_sha1_compare);
+
+		for (i = 0; i < nr_objects; i++) {
+			if (i &&
+			    !oidcmp(&sorted_by_sha[i-1]->oid, &sorted_by_sha[i]->oid))
+				continue;
+
+			count_distinct++;
+
+			if (sorted_by_sha[i]->offset > 0x7fffffff)
+				nr_large_offset++;
+			if (sorted_by_sha[i]->offset > 0xffffffff)
+				large_offset_needed = 1;
+		}
+	} else {
+		sorted_by_sha = NULL;
+	}
+
+	for (i = 0; i < nr_packs; i++)
+		total_name_len += strlen(pack_names[i]) + 1;
+
+	/* open temp file, or direct file if given */
+	if (!midx_name) {
+		struct strbuf tmp_file = STRBUF_INIT;
+		strbuf_addstr(&tmp_file, pack_dir);
+		strbuf_addstr(&tmp_file, "/tmp_midx_XXXXXX");
+
+		fd = git_mkstemp_mode(tmp_file.buf, 0444);
+		if (fd < 0)
+			die_errno("unable to create '%s'", tmp_file.buf);
+
+		midx_name = strbuf_detach(&tmp_file, NULL);
+		rename_needed = 1;
+	} else {
+		unlink(midx_name);
+		fd = open(midx_name, O_CREAT|O_EXCL|O_WRONLY, 0600);
+		if (fd < 0)
+			die_errno("unable to create '%s'", midx_name);
+	}
+	f = sha1fd(fd, midx_name);
+
+	/* fill header info */
+	hdr.midx_signature = htonl(MIDX_SIGNATURE);
+	hdr.midx_version = htonl(MIDX_VERSION);
+
+	hdr.hash_version = MIDX_OID_VERSION;
+	hdr.hash_len = MIDX_OID_LEN;
+	hdr.num_base_midx = 0;
+	hdr.num_packs = htonl(nr_packs);
+
+	/*
+	 * We expect the following chunks, which are required:
+	 *
+	 * Packfile Name Lookup
+	 * Packfile Names
+	 * OID Fanout
+	 * OID Lookup
+	 * Object Offsets
+	 */
+	hdr.num_chunks = large_offset_needed ? 6 : 5;
+
+	/* write header to file */
+	assert(sizeof(hdr) == 16);
+	sha1write(f, &hdr, sizeof(hdr));
+
+	/*
+	 * Fill initial chunk values using offsets
+	 * relative to first chunk.
+	 */
+	chunk_offsets[0] = sizeof(hdr) + 12 * (hdr.num_chunks + 1);
+	chunk_ids[0] = MIDX_CHUNKID_PACKLOOKUP;
+	chunk_offsets[1] = chunk_offsets[0] + nr_packs * 4;
+	chunk_ids[1] = MIDX_CHUNKID_OIDFANOUT;
+	chunk_offsets[2] = chunk_offsets[1] + 256 * 4;
+	chunk_ids[2] = MIDX_CHUNKID_OIDLOOKUP;
+	chunk_offsets[3] = chunk_offsets[2] + (uint64_t)count_distinct
+					    * (uint64_t)hdr.hash_len;
+	chunk_ids[3] = MIDX_CHUNKID_OBJECTOFFSETS;
+	chunk_offsets[4] = chunk_offsets[3] + 8 * (uint64_t)count_distinct;
+
+	if (large_offset_needed) {
+		chunk_ids[4] = MIDX_CHUNKID_LARGEOFFSETS;
+		chunk_offsets[5] = chunk_offsets[4] + 8 * (uint64_t)nr_large_offset;
+		chunk_ids[5] = MIDX_CHUNKID_PACKNAMES;
+		chunk_offsets[6] = chunk_offsets[5] + total_name_len;
+		chunk_ids[6] = 0;
+	} else {
+		chunk_ids[4] = MIDX_CHUNKID_PACKNAMES;
+		chunk_offsets[5] = chunk_offsets[4] + total_name_len;
+		chunk_ids[5] = 0;
+	}
+
+	for (i = 0; i <= hdr.num_chunks; i++) {
+		uint32_t chunk_write[3];
+
+		chunk_write[0] = htonl(chunk_ids[i]);
+		chunk_write[1] = htonl(chunk_offsets[i] >> 32);
+		chunk_write[2] = htonl(chunk_offsets[i] & 0xffffffff);
+		sha1write(f, chunk_write, 12);
+	}
+
+	for (chunk = 0; chunk < hdr.num_chunks; chunk++) {
+		switch (chunk_ids[chunk]) {
+		case MIDX_CHUNKID_PACKLOOKUP:
+			write_midx_chunk_packlookup(f, pack_names, nr_packs);
+			break;
+
+		case MIDX_CHUNKID_PACKNAMES:
+			write_midx_chunk_packnames(f, pack_names, nr_packs);
+			break;
+
+		case MIDX_CHUNKID_OIDFANOUT:
+			write_midx_chunk_oidfanout(f, sorted_by_sha, nr_objects);
+			break;
+
+		case MIDX_CHUNKID_OIDLOOKUP:
+			write_midx_chunk_oidlookup(f, hdr.hash_len, sorted_by_sha,
+						   nr_objects);
+			break;
+
+		case MIDX_CHUNKID_OBJECTOFFSETS:
+			write_midx_chunk_objectoffsets(f, large_offset_needed,
+						       sorted_by_sha, nr_objects,
+						       pack_perm);
+			break;
+
+		case MIDX_CHUNKID_LARGEOFFSETS:
+			write_midx_chunk_largeoffsets(f, nr_large_offset,
+						      sorted_by_sha, nr_objects);
+			break;
+
+		case 0:
+			break;
+
+		default:
+			die("unrecognized MIDX chunk id: %08x", chunk_ids[chunk]);
+		}
+	}
+
+	sha1close(f, final_hash, CSUM_CLOSE | CSUM_FSYNC);
+
+	if (rename_needed)
+	{
+		struct object_id oid;
+		char *fname;
+
+		memcpy(oid.hash, final_hash, GIT_MAX_RAWSZ);
+		fname = get_midx_filename_oid(pack_dir, &oid);
+		final_hex = sha1_to_hex(final_hash);
+
+		if (rename(midx_name, fname))
+			die("failed to rename %s to %s", midx_name, fname);
+
+		free(fname);
+	} else {
+		final_hex = midx_name;
+	}
+
+	return final_hex;
+}
diff --git a/midx.h b/midx.h
new file mode 100644
index 0000000000..4b00463651
--- /dev/null
+++ b/midx.h
@@ -0,0 +1,42 @@
+#ifndef MIDX_H
+#define MIDX_H
+
+#include "git-compat-util.h"
+#include "object.h"
+#include "csum-file.h"
+
+extern char *get_midx_filename_oid(const char *pack_dir,
+				   struct object_id *oid);
+
+struct pack_midx_entry {
+	struct object_id oid;
+	uint32_t pack_int_id;
+	off_t offset;
+};
+
+struct pack_midx_header {
+	uint32_t midx_signature;
+	uint32_t midx_version;
+	unsigned char hash_version;
+	unsigned char hash_len;
+	unsigned char num_base_midx;
+	unsigned char num_chunks;
+	uint32_t num_packs;
+};
+
+/*
+ * Write a single MIDX file storing the given entries for the
+ * given list of packfiles. If midx_name is null, then a temp
+ * file will be created and swapped using the result hash value.
+ * Otherwise, write directly to midx_name.
+ *
+ * Returns the final name of the MIDX file within pack_dir.
+ */
+extern const char *write_midx_file(const char *pack_dir,
+				   const char *midx_name,
+				   const char **pack_names,
+				   uint32_t nr_packs,
+				   struct pack_midx_entry **objects,
+				   uint32_t nr_objects);
+
+#endif
-- 
2.15.0


  parent reply	other threads:[~2018-01-07 18:15 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-01-07 18:14 [RFC PATCH 00/18] Multi-pack index (MIDX) Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 01/18] docs: Multi-Pack Index (MIDX) Design Notes Derrick Stolee
2018-01-08 19:32   ` Jonathan Tan
2018-01-08 20:35     ` Derrick Stolee
2018-01-08 22:06       ` Jonathan Tan
2018-01-07 18:14 ` [RFC PATCH 02/18] midx: specify midx file format Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 03/18] midx: create core.midx config setting Derrick Stolee
2018-01-07 18:14 ` Derrick Stolee [this message]
2018-01-07 18:14 ` [RFC PATCH 05/18] midx: create midx builtin with --write mode Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 06/18] midx: add t5318-midx.sh test script Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 07/18] midx: teach midx --write to update midx-head Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 08/18] midx: teach git-midx to read midx file details Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 09/18] midx: find details of nth object in midx Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 10/18] midx: use existing midx when writing Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 11/18] midx: teach git-midx to clear midx files Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 12/18] midx: teach git-midx to delete expired files Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 13/18] t5318-midx.h: confirm git actions are stable Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 14/18] midx: load midx files when loading packs Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 15/18] midx: use midx for approximate object count Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 16/18] midx: nth_midxed_object_oid() and bsearch_midx() Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 17/18] sha1_name: use midx for abbreviations Derrick Stolee
2018-01-07 18:14 ` [RFC PATCH 18/18] packfile: use midx for object loads Derrick Stolee
2018-01-07 22:42 ` [RFC PATCH 00/18] Multi-pack index (MIDX) Ævar Arnfjörð Bjarmason
2018-01-08  0:08   ` Derrick Stolee
2018-01-08 10:20     ` Jeff King
2018-01-08 10:27       ` Jeff King
2018-01-08 12:28         ` Ævar Arnfjörð Bjarmason
2018-01-08 13:43       ` Johannes Schindelin
2018-01-09  6:50         ` Jeff King
2018-01-09 13:05           ` Johannes Schindelin
2018-01-09 19:51             ` Stefan Beller
2018-01-09 20:12               ` Junio C Hamano
2018-01-09 20:16                 ` Stefan Beller
2018-01-09 21:31                   ` Junio C Hamano
2018-01-10 17:05               ` Johannes Schindelin
2018-01-10 10:57             ` Jeff King
2018-01-08 13:43       ` Derrick Stolee
2018-01-09  7:12         ` Jeff King
2018-01-08 11:43     ` Ævar Arnfjörð Bjarmason
2018-06-06  8:13     ` Ævar Arnfjörð Bjarmason
2018-06-06 10:27       ` [RFC PATCH 0/2] unconditional O(1) SHA-1 abbreviation Ævar Arnfjörð Bjarmason
2018-06-06 10:27       ` [RFC PATCH 1/2] config.c: use braces on multiple conditional arms Ævar Arnfjörð Bjarmason
2018-06-06 10:27       ` [RFC PATCH 2/2] sha1-name: add core.validateAbbrev & relative core.abbrev Ævar Arnfjörð Bjarmason
2018-06-06 12:04         ` Christian Couder
2018-06-06 11:24       ` [RFC PATCH 00/18] Multi-pack index (MIDX) Derrick Stolee
2018-01-10 18:25 ` Martin Fick
2018-01-10 19:39   ` Derrick Stolee
2018-01-10 21:01     ` Martin Fick

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180107181459.222909-5-dstolee@microsoft.com \
    --to=stolee@gmail.com \
    --cc=Johannes.Shindelin@gmx.de \
    --cc=dstolee@microsoft.com \
    --cc=git@jeffhostetler.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jrnieder@gmail.com \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).