git@vger.kernel.org list mirror (unofficial, one of many)
 help / color / mirror / code / Atom feed
From: Jonathan Tan <jonathantanmy@google.com>
To: me@ttaylorr.com
Cc: git@vger.kernel.org, peff@peff.net, dstolee@microsoft.com,
	gitster@pobox.com, jonathantanmy@google.com
Subject: Re: [PATCH 12/22] pack-bitmap: read multi-pack bitmaps
Date: Thu, 15 Apr 2021 19:39:25 -0700	[thread overview]
Message-ID: <20210416023925.16736-1-jonathantanmy@google.com> (raw)
In-Reply-To: <d5eeca4f112f70343b069fcb68fe61e26831843f.1617991824.git.me@ttaylorr.com>

I'll review until this patch for now. Hopefully I'll get to the rest
soon.

> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index 5205dde2e1..a4e4e4ebcc 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -984,7 +984,17 @@ static void write_reused_pack(struct hashfile *f)
>  				break;
>  
>  			offset += ewah_bit_ctz64(word >> offset);
> -			write_reused_pack_one(pos + offset, f, &w_curs);
> +			if (bitmap_is_midx(bitmap_git)) {
> +				off_t pack_offs = bitmap_pack_offset(bitmap_git,
> +								     pos + offset);
> +				uint32_t pos;
> +
> +				if (offset_to_pack_pos(reuse_packfile, pack_offs, &pos) < 0)
> +					die(_("write_reused_pack: could not locate %"PRIdMAX),
> +					    (intmax_t)pack_offs);
> +				write_reused_pack_one(pos, f, &w_curs);
> +			} else
> +				write_reused_pack_one(pos + offset, f, &w_curs);
>  			display_progress(progress_state, ++written);
>  		}
>  	}

When bitmaps are used, pos + offset is the pseudo-pack (a virtual
concatenation of all packfiles in the MIDX) position (as in, first
object is 0, second object is 1, and so on), not a position in
a single packfile. From it, we obtain a pack offset, and from it, we
obtain a position in the reused packfile (reuse_packfile). In this way,
the code is equivalent to the non-MIDX case. Looks good.

(There is no need to select a packfile here in the case of MIDX because,
as the code later shows, we always reuse only one packfile - assigned to
reuse_packfile.)

> @@ -35,8 +36,15 @@ struct stored_bitmap {
>   * the active bitmap index is the largest one.
>   */
>  struct bitmap_index {
> -	/* Packfile to which this bitmap index belongs to */
> +	/*
> +	 * The pack or multi-pack index (MIDX) that this bitmap index belongs
> +	 * to.
> +	 *
> +	 * Exactly one of these must be non-NULL; this specifies the object
> +	 * order used to interpret this bitmap.
> +	 */
>  	struct packed_git *pack;
> +	struct multi_pack_index *midx;

Makes sense.

> @@ -71,6 +79,8 @@ struct bitmap_index {
>  	/* If not NULL, this is a name-hash cache pointing into map. */
>  	uint32_t *hashes;
>  
> +	const unsigned char *checksum;
> +
>  	/*
>  	 * Extended index.
>  	 *

I see later that this checksum is used, OK. Maybe comment that this
points into map (just like "hashes", as quoted above).

> @@ -281,6 +304,54 @@ static char *pack_bitmap_filename(struct packed_git *p)
>  	return xstrfmt("%.*s.bitmap", (int)len, p->pack_name);
>  }
>  
> +static int open_midx_bitmap_1(struct bitmap_index *bitmap_git,
> +			      struct multi_pack_index *midx)
> +{
> +	struct stat st;
> +	char *idx_name = midx_bitmap_filename(midx);
> +	int fd = git_open(idx_name);
> +
> +	free(idx_name);
> +
> +	if (fd < 0)
> +		return -1;
> +
> +	if (fstat(fd, &st)) {
> +		close(fd);
> +		return -1;
> +	}
> +
> +	if (bitmap_git->pack || bitmap_git->midx) {
> +		/* ignore extra bitmap file; we can only handle one */
> +		return -1;

Here, fd is not closed? Maybe better to have multiple cleanup stages
(one when the mmap has been built, and one when not).

> @@ -302,12 +373,18 @@ static int open_pack_bitmap_1(struct bitmap_index *bitmap_git, struct packed_git
>  		return -1;
>  	}
>  
> -	if (bitmap_git->pack) {
> +	if (bitmap_git->pack || bitmap_git->midx) {
> +		/* ignore extra bitmap file; we can only handle one */
>  		warning("ignoring extra bitmap file: %s", packfile->pack_name);
>  		close(fd);
>  		return -1;
>  	}
>  
> +	if (!is_pack_valid(packfile)) {
> +		close(fd);
> +		return -1;
> +	}

Why is this needed now (and presumably, not before)?

> -static int load_pack_bitmap(struct bitmap_index *bitmap_git)
> +static int load_reverse_index(struct bitmap_index *bitmap_git)
> +{
> +	if (bitmap_is_midx(bitmap_git)) {
> +		uint32_t i;
> +		int ret;
> +
> +		ret = load_midx_revindex(bitmap_git->midx);
> +		if (ret)
> +			return ret;
> +
> +		for (i = 0; i < bitmap_git->midx->num_packs; i++) {
> +			if (prepare_midx_pack(the_repository, bitmap_git->midx, i))
> +				die(_("load_reverse_index: could not open pack"));
> +			ret = load_pack_revindex(bitmap_git->midx->packs[i]);

I was thinking about why we still need per-pack revindex, but I think
the answer is that we still need to convert pack offsets (roughly
speaking, 0 to size of packfile in bytes) to pack positions (0 to number
of objects) (and one such conversion is in the quoted section of
builtin/pack-objects.c above), and MIDX does not provide this. OK, makes
sense.

> +			if (ret)
> +				return ret;
> +		}
> +		return 0;
> +	}
> +	return load_pack_revindex(bitmap_git->pack);
> +}

[snip]

> @@ -428,10 +552,26 @@ static inline int bitmap_position_packfile(struct bitmap_index *bitmap_git,
>  	return pos;
>  }
>  
> +static int bitmap_position_midx(struct bitmap_index *bitmap_git,
> +				const struct object_id *oid)
> +{
> +	uint32_t want, got;
> +	if (!bsearch_midx(oid, bitmap_git->midx, &want))
> +		return -1;
> +
> +	if (midx_to_pack_pos(bitmap_git->midx, want, &got) < 0)
> +		return -1;
> +	return got;
> +}

bsearch_midx() gives us the position in the MIDX (e.g. if we had an
object with the name 00...00, "want" will be 0, and if we had an
object with the name ff...ff, "want" will be the number of objects
minus 1). midx_to_pack_pos() converts that into the position in the
pseudo-pack, which is what we want. OK.

> @@ -730,14 +871,28 @@ static void show_objects_for_type(
>  
>  			offset += ewah_bit_ctz64(word >> offset);
>  
> -			index_pos = pack_pos_to_index(bitmap_git->pack, pos + offset);
> -			ofs = pack_pos_to_offset(bitmap_git->pack, pos + offset);
> -			nth_packed_object_id(&oid, bitmap_git->pack, index_pos);
> +			if (bitmap_is_midx(bitmap_git)) {
> +				struct multi_pack_index *m = bitmap_git->midx;
> +				uint32_t pack_id;
> +
> +				index_pos = pack_pos_to_midx(m, pos + offset);
> +				ofs = nth_midxed_offset(m, index_pos);
> +				nth_midxed_object_oid(&oid, m, index_pos);
> +
> +				pack_id = nth_midxed_pack_int_id(m, index_pos);
> +				pack = bitmap_git->midx->packs[pack_id];

This is similar to the builtin/pack-objects.c case right at the start of
this patch. (bitmap_pack_offset(), used in builtin/pack-objects.c, is
pack_pos_to_midx() and nth_midx_offset() chained.) OK.

> +			} else {
> +				index_pos = pack_pos_to_index(bitmap_git->pack, pos + offset);
> +				ofs = pack_pos_to_offset(bitmap_git->pack, pos + offset);
> +				nth_bitmap_object_oid(bitmap_git, &oid, index_pos);
> +
> +				pack = bitmap_git->pack;
> +			}
>  
>  			if (bitmap_git->hashes)
>  				hash = get_be32(bitmap_git->hashes + index_pos);
>  
> -			show_reach(&oid, object_type, 0, hash, bitmap_git->pack, ofs);
> +			show_reach(&oid, object_type, 0, hash, pack, ofs);
>  		}
>  	}
>  }
> @@ -749,8 +904,13 @@ static int in_bitmapped_pack(struct bitmap_index *bitmap_git,
>  		struct object *object = roots->item;
>  		roots = roots->next;
>  
> -		if (find_pack_entry_one(object->oid.hash, bitmap_git->pack) > 0)
> -			return 1;
> +		if (bitmap_is_midx(bitmap_git)) {
> +			if (bsearch_midx(&object->oid, bitmap_git->midx, NULL))
> +				return 1;
> +		} else {
> +			if (find_pack_entry_one(object->oid.hash, bitmap_git->pack) > 0)
> +				return 1;
> +		}
>  	}
>  
>  	return 0;

OK - we don't actually care about the position, just that it exists,
which is why we can pass NULL as the last argument to bsearch_midx().

> @@ -839,14 +999,26 @@ static void filter_bitmap_blob_none(struct bitmap_index *bitmap_git,
>  static unsigned long get_size_by_pos(struct bitmap_index *bitmap_git,
>  				     uint32_t pos)
>  {
> -	struct packed_git *pack = bitmap_git->pack;
>  	unsigned long size;
>  	struct object_info oi = OBJECT_INFO_INIT;
>  
>  	oi.sizep = &size;
>  
>  	if (pos < bitmap_num_objects(bitmap_git)) {
> -		off_t ofs = pack_pos_to_offset(pack, pos);
> +		struct packed_git *pack;
> +		off_t ofs;
> +
> +		if (bitmap_is_midx(bitmap_git)) {
> +			uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, pos);
> +			uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
> +
> +			pack = bitmap_git->midx->packs[pack_id];
> +			ofs = nth_midxed_offset(bitmap_git->midx, midx_pos);
> +		} else {
> +			pack = bitmap_git->pack;
> +			ofs = pack_pos_to_offset(pack, pos);
> +		}
> +
>  		if (packed_object_info(the_repository, pack, ofs, &oi) < 0) {
>  			struct object_id oid;
>  			nth_bitmap_object_oid(bitmap_git, &oid,

Makes sense - "pos" is the position in the pseudo-pack. From it we get
the MIDX position, and then we can get the pack ID and pack offset as
usual.

> @@ -1081,15 +1253,29 @@ static void try_partial_reuse(struct bitmap_index *bitmap_git,
>  			      struct bitmap *reuse,
>  			      struct pack_window **w_curs)
>  {
> -	off_t offset, header;
> +	struct packed_git *pack;
> +	off_t offset, delta_obj_offset;
>  	enum object_type type;
>  	unsigned long size;
>  
>  	if (pos >= bitmap_num_objects(bitmap_git))
>  		return; /* not actually in the pack or MIDX */
>  
> -	offset = header = pack_pos_to_offset(bitmap_git->pack, pos);
> -	type = unpack_object_header(bitmap_git->pack, w_curs, &offset, &size);
> +	if (bitmap_is_midx(bitmap_git)) {
> +		uint32_t pack_id, midx_pos;
> +
> +		midx_pos = pack_pos_to_midx(bitmap_git->midx, pos);
> +		pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
> +
> +		pack = bitmap_git->midx->packs[pack_id];
> +		offset = nth_midxed_offset(bitmap_git->midx, midx_pos);

Would it be useful to assert somewhere here that "pack" is the preferred
pack?

Going further, is it reasonable to say that positions 0..n in the
preferred pack (where n is the number of objects in the preferred pack)
match positions 0..n in the pseudo-pack exactly? If yes, maybe we can
simplify things by explaining that we can operate in the MIDX case
exactly (or as similarly as possible) like we operate on a single
packfile because of this, instead of always needing to consider if a
delta base could appear in the MIDX as belonging to another packfile.

> @@ -1538,6 +1792,29 @@ static off_t get_disk_usage_for_type(struct bitmap_index *bitmap_git,
>  
>  			offset += ewah_bit_ctz64(word >> offset);
>  			pos = base + offset;
> +
> +			if (bitmap_is_midx(bitmap_git)) {
> +				uint32_t pack_pos;
> +				uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, pos);
> +				uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
> +				off_t offset = nth_midxed_offset(bitmap_git->midx, midx_pos);
> +
> +				pack = bitmap_git->midx->packs[pack_id];
> +
> +				if (offset_to_pack_pos(pack, offset, &pack_pos) < 0) {
> +					struct object_id oid;
> +					nth_midxed_object_oid(&oid, bitmap_git->midx, midx_pos);
> +
> +					die(_("could not find %s in pack #%"PRIu32" at offset %"PRIuMAX),
> +					    oid_to_hex(&oid),
> +					    pack_id,
> +					    (uintmax_t)offset);
> +				}
> +
> +				pos = pack_pos;
> +			} else
> +				pack = bitmap_git->pack;
> +
>  			total += pack_pos_to_offset(pack, pos + 1) -
>  				 pack_pos_to_offset(pack, pos);
>  		}

"pos" is assigned to twice in the MIDX case (with different semantics).
I think it's better to do it like in the rest of the patch - use "base +
offset" as the argument to pack_pos_to_midx, and then you wouldn't need
to assign to "pos" twice.

> diff --git a/packfile.c b/packfile.c
> index 8668345d93..c444e365a3 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -863,7 +863,7 @@ static void prepare_pack(const char *full_name, size_t full_name_len,
>  	if (!strcmp(file_name, "multi-pack-index"))
>  		return;
>  	if (starts_with(file_name, "multi-pack-index") &&
> -	    ends_with(file_name, ".rev"))
> +	    (ends_with(file_name, ".bitmap") || ends_with(file_name, ".rev")))
>  		return;
>  	if (ends_with(file_name, ".idx") ||
>  	    ends_with(file_name, ".rev") ||

I guess this will come into play when we start writing MIDX bitmaps?

  reply	other threads:[~2021-04-16  2:39 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-09 18:10 [PATCH 00/22] multi-pack reachability bitmaps Taylor Blau
2021-04-09 18:10 ` [PATCH 01/22] pack-bitmap.c: harden 'test_bitmap_walk()' to check type bitmaps Taylor Blau
2021-04-09 18:10 ` [PATCH 02/22] pack-bitmap-write.c: gracefully fail to write non-closed bitmaps Taylor Blau
2021-04-16  2:46   ` Jonathan Tan
2021-04-09 18:10 ` [PATCH 03/22] pack-bitmap-write.c: free existing bitmaps Taylor Blau
2021-04-09 18:10 ` [PATCH 04/22] Documentation: build 'technical/bitmap-format' by default Taylor Blau
2021-04-09 18:11 ` [PATCH 05/22] Documentation: describe MIDX-based bitmaps Taylor Blau
2021-04-09 18:11 ` [PATCH 06/22] midx: make a number of functions non-static Taylor Blau
2021-04-09 18:11 ` [PATCH 07/22] midx: clear auxiliary .rev after replacing the MIDX Taylor Blau
2021-04-09 18:11 ` [PATCH 08/22] midx: respect 'core.multiPackIndex' when writing Taylor Blau
2021-04-09 18:11 ` [PATCH 09/22] pack-bitmap.c: introduce 'bitmap_num_objects()' Taylor Blau
2021-04-09 18:11 ` [PATCH 10/22] pack-bitmap.c: introduce 'nth_bitmap_object_oid()' Taylor Blau
2021-04-09 18:11 ` [PATCH 11/22] pack-bitmap.c: introduce 'bitmap_is_preferred_refname()' Taylor Blau
2021-04-09 18:11 ` [PATCH 12/22] pack-bitmap: read multi-pack bitmaps Taylor Blau
2021-04-16  2:39   ` Jonathan Tan [this message]
2021-04-16  3:13     ` Taylor Blau
2021-04-09 18:11 ` [PATCH 13/22] pack-bitmap: write " Taylor Blau
2021-05-04  5:02   ` Jonathan Tan
2021-05-06 20:18     ` Taylor Blau
2021-05-06 22:00       ` Jonathan Tan
2021-04-09 18:11 ` [PATCH 14/22] t5310: move some tests to lib-bitmap.sh Taylor Blau
2021-04-09 18:11 ` [PATCH 15/22] t/helper/test-read-midx.c: add --checksum mode Taylor Blau
2021-04-09 18:12 ` [PATCH 16/22] t5326: test multi-pack bitmap behavior Taylor Blau
2021-05-04 17:51   ` Jonathan Tan
2021-04-09 18:12 ` [PATCH 17/22] t5310: disable GIT_TEST_MULTI_PACK_INDEX_WRITE_BITMAP Taylor Blau
2021-04-09 18:12 ` [PATCH 18/22] t5319: don't write MIDX bitmaps in t5319 Taylor Blau
2021-04-09 18:12 ` [PATCH 19/22] t7700: update to work with MIDX bitmap test knob Taylor Blau
2021-04-09 18:12 ` [PATCH 20/22] midx: respect 'GIT_TEST_MULTI_PACK_INDEX_WRITE_BITMAP' Taylor Blau
2021-04-09 18:12 ` [PATCH 21/22] p5310: extract full and partial bitmap tests Taylor Blau
2021-04-09 18:12 ` [PATCH 22/22] p5326: perf tests for MIDX bitmaps Taylor Blau
2021-05-04 18:00   ` Jonathan Tan
2021-05-05  0:55     ` Junio C Hamano
2021-06-21 22:24 ` [PATCH v2 00/24] multi-pack reachability bitmaps Taylor Blau
2021-06-21 22:24   ` [PATCH v2 01/24] pack-bitmap.c: harden 'test_bitmap_walk()' to check type bitmaps Taylor Blau
2021-06-21 22:25   ` [PATCH v2 02/24] pack-bitmap-write.c: gracefully fail to write non-closed bitmaps Taylor Blau
2021-06-21 22:25   ` [PATCH v2 03/24] pack-bitmap-write.c: free existing bitmaps Taylor Blau
2021-06-21 22:25   ` [PATCH v2 04/24] Documentation: build 'technical/bitmap-format' by default Taylor Blau
2021-06-21 22:25   ` [PATCH v2 05/24] Documentation: describe MIDX-based bitmaps Taylor Blau
2021-06-21 22:25   ` [PATCH v2 06/24] midx: make a number of functions non-static Taylor Blau
2021-06-21 22:25   ` [PATCH v2 07/24] midx: clear auxiliary .rev after replacing the MIDX Taylor Blau
2021-06-21 22:25   ` [PATCH v2 08/24] midx: respect 'core.multiPackIndex' when writing Taylor Blau
2021-06-21 22:25   ` [PATCH v2 09/24] midx: infer preferred pack when not given one Taylor Blau
2021-06-21 22:25   ` [PATCH v2 10/24] pack-bitmap.c: introduce 'bitmap_num_objects()' Taylor Blau
2021-06-21 22:25   ` [PATCH v2 11/24] pack-bitmap.c: introduce 'nth_bitmap_object_oid()' Taylor Blau
2021-06-21 22:25   ` [PATCH v2 12/24] pack-bitmap.c: introduce 'bitmap_is_preferred_refname()' Taylor Blau
2021-06-21 22:25   ` [PATCH v2 13/24] pack-bitmap: read multi-pack bitmaps Taylor Blau
2021-06-21 22:25   ` [PATCH v2 14/24] pack-bitmap: write " Taylor Blau
2021-06-21 22:25   ` [PATCH v2 15/24] t5310: move some tests to lib-bitmap.sh Taylor Blau
2021-06-21 22:25   ` [PATCH v2 16/24] t/helper/test-read-midx.c: add --checksum mode Taylor Blau
2021-06-21 22:25   ` [PATCH v2 17/24] t5326: test multi-pack bitmap behavior Taylor Blau
2021-06-21 22:25   ` [PATCH v2 18/24] t0410: disable GIT_TEST_MULTI_PACK_INDEX_WRITE_BITMAP Taylor Blau
2021-06-21 22:25   ` [PATCH v2 19/24] t5310: " Taylor Blau
2021-06-21 22:25   ` [PATCH v2 20/24] t5319: don't write MIDX bitmaps in t5319 Taylor Blau
2021-06-21 22:25   ` [PATCH v2 21/24] t7700: update to work with MIDX bitmap test knob Taylor Blau
2021-06-21 22:25   ` [PATCH v2 22/24] midx: respect 'GIT_TEST_MULTI_PACK_INDEX_WRITE_BITMAP' Taylor Blau
2021-06-21 22:25   ` [PATCH v2 23/24] p5310: extract full and partial bitmap tests Taylor Blau
2021-06-21 22:26   ` [PATCH v2 24/24] p5326: perf tests for MIDX bitmaps Taylor Blau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210416023925.16736-1-jonathantanmy@google.com \
    --to=jonathantanmy@google.com \
    --cc=dstolee@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=me@ttaylorr.com \
    --cc=peff@peff.net \
    --subject='Re: [PATCH 12/22] pack-bitmap: read multi-pack bitmaps' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

git@vger.kernel.org list mirror (unofficial, one of many)

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/git
	git clone --mirror http://ou63pmih66umazou.onion/git
	git clone --mirror http://czquwvybam4bgbro.onion/git
	git clone --mirror http://hjrcffqmbrq6wope.onion/git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 git git/ https://public-inbox.org/git \
		git@vger.kernel.org
	public-inbox-index git

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.version-control.git
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.version-control.git
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.version-control.git
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.version-control.git
	nntp://news.gmane.io/gmane.comp.version-control.git
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/mirrors/git.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git