git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH v2] Custom compression levels for objects and packs
@ 2007-05-08 22:38 Dana How
  2007-05-08 23:56 ` Junio C Hamano
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Dana How @ 2007-05-08 22:38 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List, danahow


Add config variables pack.compression and core.loosecompression .
Loose objects will be compressed using level
  isset(core.loosecompression) ? core.loosecompression :
  isset(core.compression) ? core.compression : Z_BEST_SPEED
and objects in packs will be compressed using level
  isset(pack.compression) ? pack.compression :
  isset(core.compression) ? core.compression : Z_DEFAULT_COMPRESSION
pack-objects also accepts --compression=N which
overrides the latter expression.

This applies on top of the git-repack --max-pack-size patchset.

Signed-off-by: Dana L. How <danahow@gmail.com>
---
 builtin-pack-objects.c |   32 +++++++++++++++++++++++++++++++-
 cache.h                |    2 ++
 config.c               |   18 +++++++++++++++++-
 environment.c          |    4 +++-
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 8824793..e80a1d6 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -70,6 +70,8 @@ static uint32_t pack_size_limit;
 static int pack_to_stdout;
 static int num_preferred_base;
 static struct progress progress_state;
+static int pack_compression_level = Z_DEFAULT_COMPRESSION;
+static int pack_compression_seen;
 
 /*
  * The object names in objects array are hashed with this hashtable,
@@ -444,6 +446,10 @@ static unsigned long write_object(struct sha1file *f,
 				 * and we do not need to deltify it.
 				 */
 
+	/* differing core & pack compression when loose object -> must recompress */
+	if (!entry->in_pack && pack_compression_level != zlib_compression_level)
+		to_reuse = 0;
+	else
 	if (!entry->in_pack && !entry->delta) {
 		unsigned char *map;
 		unsigned long mapsize;
@@ -492,7 +498,7 @@ static unsigned long write_object(struct sha1file *f,
 		}
 		/* compress the data to store and put compressed length in datalen */
 		memset(&stream, 0, sizeof(stream));
-		deflateInit(&stream, zlib_compression_level);
+		deflateInit(&stream, pack_compression_level);
 		maxsize = deflateBound(&stream, size);
 		out = xmalloc(maxsize);
 		/* Compress it */
@@ -1624,6 +1630,16 @@ static int git_pack_config(const char *k, const char *v)
 		window = git_config_int(k, v);
 		return 0;
 	}
+	if (!strcmp(k, "pack.compression")) {
+		int level = git_config_int(k, v);
+		if (level == -1)
+			level = Z_DEFAULT_COMPRESSION;
+		else if (level < 0 || level > Z_BEST_COMPRESSION)
+			die("bad pack compression level %d", level);
+		pack_compression_level = level;
+		pack_compression_seen = 1;
+		return 0;
+	}
 	return git_default_config(k, v);
 }
 
@@ -1734,6 +1750,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	rp_ac = 2;
 
 	git_config(git_pack_config);
+	if (!pack_compression_seen && core_compression_seen)
+		pack_compression_level = core_compression_level;
 
 	progress = isatty(2);
 	for (i = 1; i < argc; i++) {
@@ -1761,6 +1779,18 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 				usage(pack_usage);
 			continue;
 		}
+		if (!prefixcmp(arg, "--compression=")) {
+			char *end;
+			int level = strtoul(arg+14, &end, 0);
+			if (!arg[14] || *end)
+				usage(pack_usage);
+			if (level == -1)
+				level = Z_DEFAULT_COMPRESSION;
+			else if (level < 0 || level > Z_BEST_COMPRESSION)
+				die("bad pack compression level %d", level);
+			pack_compression_level = level;
+			continue;
+		}
 		if (!prefixcmp(arg, "--window=")) {
 			char *end;
 			window = strtoul(arg+9, &end, 0);
diff --git a/cache.h b/cache.h
index 8e76152..2b3f359 100644
--- a/cache.h
+++ b/cache.h
@@ -283,6 +283,8 @@ extern int warn_ambiguous_refs;
 extern int shared_repository;
 extern const char *apply_default_whitespace;
 extern int zlib_compression_level;
+extern int core_compression_level;
+extern int core_compression_seen;
 extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
diff --git a/config.c b/config.c
index 70d1055..5627ed6 100644
--- a/config.c
+++ b/config.c
@@ -12,6 +12,8 @@
 static FILE *config_file;
 static const char *config_file_name;
 static int config_linenr;
+static int zlib_compression_seen;
+
 static int get_next_char(void)
 {
 	int c;
@@ -304,13 +306,27 @@ int git_default_config(const char *var, const char *value)
 		return 0;
 	}
 
-	if (!strcmp(var, "core.compression")) {
+	if (!strcmp(var, "core.loosecompression")) {
 		int level = git_config_int(var, value);
 		if (level == -1)
 			level = Z_DEFAULT_COMPRESSION;
 		else if (level < 0 || level > Z_BEST_COMPRESSION)
 			die("bad zlib compression level %d", level);
 		zlib_compression_level = level;
+		zlib_compression_seen = 1;
+		return 0;
+	}
+
+	if (!strcmp(var, "core.compression")) {
+		int level = git_config_int(var, value);
+		if (level == -1)
+			level = Z_DEFAULT_COMPRESSION;
+		else if (level < 0 || level > Z_BEST_COMPRESSION)
+			die("bad zlib compression level %d", level);
+		core_compression_level = level;
+		core_compression_seen = 1;
+		if (!zlib_compression_seen)
+			zlib_compression_level = level;
 		return 0;
 	}
 
diff --git a/environment.c b/environment.c
index 2231659..b7aeb1a 100644
--- a/environment.c
+++ b/environment.c
@@ -24,7 +24,9 @@ const char *git_commit_encoding;
 const char *git_log_output_encoding;
 int shared_repository = PERM_UMASK;
 const char *apply_default_whitespace;
-int zlib_compression_level = Z_DEFAULT_COMPRESSION;
+int zlib_compression_level = Z_BEST_SPEED;
+int core_compression_level;
+int core_compression_seen;
 size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 16 * 1024 * 1024;
-- 
1.5.2.rc0.787.g0014

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-08 22:38 [PATCH v2] Custom compression levels for objects and packs Dana How
@ 2007-05-08 23:56 ` Junio C Hamano
  2007-05-09  0:16   ` Nicolas Pitre
  2007-05-09  0:25   ` Dana How
  2007-05-09  0:30 ` Petr Baudis
  2007-05-09 13:56 ` Theodore Tso
  2 siblings, 2 replies; 23+ messages in thread
From: Junio C Hamano @ 2007-05-08 23:56 UTC (permalink / raw)
  To: Dana How; +Cc: Git Mailing List

Dana How <danahow@gmail.com> writes:

> Add config variables pack.compression and core.loosecompression .
> Loose objects will be compressed using level
>   isset(core.loosecompression) ? core.loosecompression :
>   isset(core.compression) ? core.compression : Z_BEST_SPEED
> and objects in packs will be compressed using level
>   isset(pack.compression) ? pack.compression :
>   isset(core.compression) ? core.compression : Z_DEFAULT_COMPRESSION
> pack-objects also accepts --compression=N which
> overrides the latter expression.

Do you think the above is readable?

  Compression level for loose objects is controlled by variable
  core.loosecompression (or core.compression, if the former is
  missing), and defaults to best-speed.

or something like that?

> This applies on top of the git-repack --max-pack-size patchset.

Hmph, that makes the --max-pack-size patchset take this more
trivial and straightforward improvements hostage.  In general,
I'd prefer more elaborate ones based on less questionable
series.

> @@ -444,6 +446,10 @@ static unsigned long write_object(struct sha1file *f,
>  				 * and we do not need to deltify it.
>  				 */
>  
> +	/* differing core & pack compression when loose object -> must recompress */
> +	if (!entry->in_pack && pack_compression_level != zlib_compression_level)
> +		to_reuse = 0;
> +	else

I am not sure if that is worth it, as you do not know if the
loose object you are looking at were compressed with the current
settings.

> diff --git a/cache.h b/cache.h
> index 8e76152..2b3f359 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -283,6 +283,8 @@ extern int warn_ambiguous_refs;
>  extern int shared_repository;
>  extern const char *apply_default_whitespace;
>  extern int zlib_compression_level;
> +extern int core_compression_level;
> +extern int core_compression_seen;

Could we somehow remove _seen?  Perhaps by initializing the
_level to -1?

> +int core_compression_level;
> +int core_compression_seen;

Same here.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-08 23:56 ` Junio C Hamano
@ 2007-05-09  0:16   ` Nicolas Pitre
  2007-05-09  0:29     ` Dana How
  2007-05-09  0:25   ` Dana How
  1 sibling, 1 reply; 23+ messages in thread
From: Nicolas Pitre @ 2007-05-09  0:16 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Dana How, Git Mailing List

On Tue, 8 May 2007, Junio C Hamano wrote:

> Dana How <danahow@gmail.com> writes:
> 
> > @@ -444,6 +446,10 @@ static unsigned long write_object(struct sha1file *f,
> >  				 * and we do not need to deltify it.
> >  				 */
> >  
> > +	/* differing core & pack compression when loose object -> must recompress */
> > +	if (!entry->in_pack && pack_compression_level != zlib_compression_level)
> > +		to_reuse = 0;
> > +	else
> 
> I am not sure if that is worth it, as you do not know if the
> loose object you are looking at were compressed with the current
> settings.

I was about to make the same comment.

> > diff --git a/cache.h b/cache.h
> > index 8e76152..2b3f359 100644
> > --- a/cache.h
> > +++ b/cache.h
> > @@ -283,6 +283,8 @@ extern int warn_ambiguous_refs;
> >  extern int shared_repository;
> >  extern const char *apply_default_whitespace;
> >  extern int zlib_compression_level;
> > +extern int core_compression_level;
> > +extern int core_compression_seen;
> 
> Could we somehow remove _seen?  Perhaps by initializing the
> _level to -1?

-1 is a valid value for compression.  Actually it is equivalent to 
Z_DEFAULT_COMPRESSION.

If we want the fallback logic to work, at some point we must remember if 
the current value is the default or if it is the result of an explicit 
config option.


Nicolas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-08 23:56 ` Junio C Hamano
  2007-05-09  0:16   ` Nicolas Pitre
@ 2007-05-09  0:25   ` Dana How
  2007-05-09  1:23     ` Nicolas Pitre
  2007-05-09  5:59     ` [PATCH v2] Custom compression levels for objects and packs Junio C Hamano
  1 sibling, 2 replies; 23+ messages in thread
From: Dana How @ 2007-05-09  0:25 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List, danahow

On 5/8/07, Junio C Hamano <junkio@cox.net> wrote:
> Dana How <danahow@gmail.com> writes:
> > Add config variables pack.compression and core.loosecompression .
> > Loose objects will be compressed using level
> >   isset(core.loosecompression) ? core.loosecompression :
> >   isset(core.compression) ? core.compression : Z_BEST_SPEED
> > and objects in packs will be compressed using level
> >   isset(pack.compression) ? pack.compression :
> >   isset(core.compression) ? core.compression : Z_DEFAULT_COMPRESSION
> > pack-objects also accepts --compression=N which
> > overrides the latter expression.
>
> Do you think the above is readable?
>   Compression level for loose objects is controlled by variable
>   core.loosecompression (or core.compression, if the former is
>   missing), and defaults to best-speed.
> or something like that?
Your phrasing is much better.

> > This applies on top of the git-repack --max-pack-size patchset.
> Hmph, that makes the --max-pack-size patchset take this more
> trivial and straightforward improvements hostage.  In general,
> I'd prefer more elaborate ones based on less questionable
> series.
The max-pack-size and pack.compression patches touch the same lines.
I thought my options were:
* Submit independently and make you merge; or
* Make one precede the other.
Since max-pack-size has been out there since April 4 and
the first acceptable version was May 1 (suggested by 0 comments),
I didn't realize it was a "questionable series".

I think it should be straightforward for me to re-submit this
based on current master.

> > +     /* differing core & pack compression when loose object -> must recompress */
> > +     if (!entry->in_pack && pack_compression_level != zlib_compression_level)
> > +             to_reuse = 0;
> > +     else
> I am not sure if that is worth it, as you do not know if the
> loose object you are looking at were compressed with the current
> settings.
You do not know for certain, that is correct.  However, config
settings setting unequal compression levels signal that you
care differently about the two cases. (For me,  I want the
compression investment to correspond to the expected lifetime of the file.)
Also,  *if* we have the knobs we want in the config file,
I don't think we're going to be changing these settings all that often.

If I didn't have this check forcing recompression in the pack,
then in the absence of deltification each object would enter the pack
by being copied (in the preceding code block) and pack.compression
would have little effect.  I actually experienced this the very first
time I imported a large dataset into git (I was trying to achieve the
effect of this patch by changing core.compression dynamically,  and
was a bit mystified for a while by the result).

Thus,  if core.loosecompression is set to speed up git-add,  I should
take the time to recompress the object when packing if pack.compression
is different (of course the hit of not doing so will be lessened by
deltification
which forces a new compression).

> > diff --git a/cache.h b/cache.h
> > index 8e76152..2b3f359 100644
> > --- a/cache.h
> > +++ b/cache.h
> > @@ -283,6 +283,8 @@ extern int warn_ambiguous_refs;
> >  extern int shared_repository;
> >  extern const char *apply_default_whitespace;
> >  extern int zlib_compression_level;
> > +extern int core_compression_level;
> > +extern int core_compression_seen;
>
> Could we somehow remove _seen?  Perhaps by initializing the
> _level to -1?
>
> > +int core_compression_level;
> > +int core_compression_seen;
>
> Same here.
I agree completely.  But,  what magic value should I use
to initialize the _level variables so I know they are not set?
All valid settings come from zlib.h through #define's but
there is no "invalid" defined.  Maybe I'll use -99.

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  0:16   ` Nicolas Pitre
@ 2007-05-09  0:29     ` Dana How
  2007-05-09  1:03       ` Nicolas Pitre
  0 siblings, 1 reply; 23+ messages in thread
From: Dana How @ 2007-05-09  0:29 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, Git Mailing List, danahow

On 5/8/07, Nicolas Pitre <nico@cam.org> wrote:
> On Tue, 8 May 2007, Junio C Hamano wrote:
> > Dana How <danahow@gmail.com> writes:
> > > +   /* differing core & pack compression when loose object -> must recompress */
> > > +   if (!entry->in_pack && pack_compression_level != zlib_compression_level)
> > > +           to_reuse = 0;
> > > +   else
> > I am not sure if that is worth it, as you do not know if the
> > loose object you are looking at were compressed with the current
> > settings.
> I was about to make the same comment.
I was bitten by *not* doing this.  Please see the more verbose
reply to Junio's comment.

> > Could we somehow remove _seen?  Perhaps by initializing the
> > _level to -1?
>
> -1 is a valid value for compression.  Actually it is equivalent to
> Z_DEFAULT_COMPRESSION.
>
> If we want the fallback logic to work, at some point we must remember if
> the current value is the default or if it is the result of an explicit
> config option.
I can leave as-is, or use a magic value like -99 and
depend on it not colliding with values in zlib.h.

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-08 22:38 [PATCH v2] Custom compression levels for objects and packs Dana How
  2007-05-08 23:56 ` Junio C Hamano
@ 2007-05-09  0:30 ` Petr Baudis
  2007-05-09 13:56 ` Theodore Tso
  2 siblings, 0 replies; 23+ messages in thread
From: Petr Baudis @ 2007-05-09  0:30 UTC (permalink / raw)
  To: Dana How; +Cc: Junio C Hamano, Git Mailing List

On Wed, May 09, 2007 at 12:38:22AM CEST, Dana How wrote:
> diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
> index 8824793..e80a1d6 100644
> --- a/builtin-pack-objects.c
> +++ b/builtin-pack-objects.c
> @@ -444,6 +446,10 @@ static unsigned long write_object(struct sha1file *f,
>  				 * and we do not need to deltify it.
>  				 */
>  
> +	/* differing core & pack compression when loose object -> must recompress */
> +	if (!entry->in_pack && pack_compression_level != zlib_compression_level)
> +		to_reuse = 0;
> +	else
>  	if (!entry->in_pack && !entry->delta) {

Style: the else and if should be probably on the same line.

>  		unsigned char *map;
>  		unsigned long mapsize;
> @@ -1624,6 +1630,16 @@ static int git_pack_config(const char *k, const char *v)
>  		window = git_config_int(k, v);
>  		return 0;
>  	}
> +	if (!strcmp(k, "pack.compression")) {
> +		int level = git_config_int(k, v);
> +		if (level == -1)
> +			level = Z_DEFAULT_COMPRESSION;
> +		else if (level < 0 || level > Z_BEST_COMPRESSION)
> +			die("bad pack compression level %d", level);
> +		pack_compression_level = level;
> +		pack_compression_seen = 1;
> +		return 0;
> +	}
>  	return git_default_config(k, v);
>  }
>  

Where is this documented?

> @@ -1761,6 +1779,18 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
>  				usage(pack_usage);
>  			continue;
>  		}
> +		if (!prefixcmp(arg, "--compression=")) {
> +			char *end;
> +			int level = strtoul(arg+14, &end, 0);
> +			if (!arg[14] || *end)
> +				usage(pack_usage);
> +			if (level == -1)
> +				level = Z_DEFAULT_COMPRESSION;
> +			else if (level < 0 || level > Z_BEST_COMPRESSION)
> +				die("bad pack compression level %d", level);
> +			pack_compression_level = level;
> +			continue;
> +		}
>  		if (!prefixcmp(arg, "--window=")) {
>  			char *end;
>  			window = strtoul(arg+9, &end, 0);

Where is this documented?

> diff --git a/config.c b/config.c
> index 70d1055..5627ed6 100644
> --- a/config.c
> +++ b/config.c
> @@ -304,13 +306,27 @@ int git_default_config(const char *var, const char *value)
>  		return 0;
>  	}
>  
> -	if (!strcmp(var, "core.compression")) {
> +	if (!strcmp(var, "core.loosecompression")) {

Is this config variable documented?

-- 
				Petr "Pasky" Baudis
Stuff: http://pasky.or.cz/
Ever try. Ever fail. No matter. // Try again. Fail again. Fail better.
		-- Samuel Beckett

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  0:29     ` Dana How
@ 2007-05-09  1:03       ` Nicolas Pitre
  2007-05-09  6:46         ` Dana How
  0 siblings, 1 reply; 23+ messages in thread
From: Nicolas Pitre @ 2007-05-09  1:03 UTC (permalink / raw)
  To: Dana How; +Cc: Junio C Hamano, Git Mailing List

On Tue, 8 May 2007, Dana How wrote:

> On 5/8/07, Nicolas Pitre <nico@cam.org> wrote:
> > On Tue, 8 May 2007, Junio C Hamano wrote:
> > If we want the fallback logic to work, at some point we must remember if
> > the current value is the default or if it is the result of an explicit
> > config option.
> I can leave as-is, or use a magic value like -99 and
> depend on it not colliding with values in zlib.h.

And where would you set those variables to a sensible default in the 
absence of any config option?


Nicolas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  0:25   ` Dana How
@ 2007-05-09  1:23     ` Nicolas Pitre
  2007-05-09  9:21       ` Dana How
  2007-05-09  5:59     ` [PATCH v2] Custom compression levels for objects and packs Junio C Hamano
  1 sibling, 1 reply; 23+ messages in thread
From: Nicolas Pitre @ 2007-05-09  1:23 UTC (permalink / raw)
  To: Dana How; +Cc: Junio C Hamano, Git Mailing List

On Tue, 8 May 2007, Dana How wrote:

> Since max-pack-size has been out there since April 4 and
> the first acceptable version was May 1 (suggested by 0 comments),
> I didn't realize it was a "questionable series".
> 
> I think it should be straightforward for me to re-submit this
> based on current master.

Since this patch is simpler it could be merged much faster, before the 
pack limit series.

> > > +     /* differing core & pack compression when loose object -> must
> > recompress */
> > > +     if (!entry->in_pack && pack_compression_level !=
> > zlib_compression_level)
> > > +             to_reuse = 0;
> > > +     else
> > I am not sure if that is worth it, as you do not know if the
> > loose object you are looking at were compressed with the current
> > settings.
> You do not know for certain, that is correct.  However, config
> settings setting unequal compression levels signal that you
> care differently about the two cases. (For me,  I want the
> compression investment to correspond to the expected lifetime of the file.)
> Also,  *if* we have the knobs we want in the config file,
> I don't think we're going to be changing these settings all that often.
> 
> If I didn't have this check forcing recompression in the pack,
> then in the absence of deltification each object would enter the pack
> by being copied (in the preceding code block) and pack.compression
> would have little effect.  I actually experienced this the very first
> time I imported a large dataset into git (I was trying to achieve the
> effect of this patch by changing core.compression dynamically,  and
> was a bit mystified for a while by the result).
> 
> Thus,  if core.loosecompression is set to speed up git-add,  I should
> take the time to recompress the object when packing if pack.compression
> is different (of course the hit of not doing so will be lessened by
> deltification
> which forces a new compression).

Right.  And this also depends whether or not you have core.legacyheaders 
set to false or not.

And the whole purpose for setting core.legacyheaders is exactly to allow 
for loose objects to be copied straight into the pack.  This should have 
priority over mismatched compression levels IMHO.

Also, when repacking, delta reuse does not recompress objects for the 
same reason, regardless of the compression level used when they were 
compressed initially.  Same argument goes for delta depth.

So if you really want to ensure a compression level on the whole pack, 
you'll have to use -f with git-repack. Or leave core.legacyheaders 
unset.


Nicolas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  0:25   ` Dana How
  2007-05-09  1:23     ` Nicolas Pitre
@ 2007-05-09  5:59     ` Junio C Hamano
  2007-05-09  6:24       ` Dana How
  1 sibling, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2007-05-09  5:59 UTC (permalink / raw)
  To: Dana How; +Cc: Git Mailing List

"Dana How" <danahow@gmail.com> writes:

> On 5/8/07, Junio C Hamano <junkio@cox.net> wrote:
>> Dana How <danahow@gmail.com> writes:
>> ...
>> > This applies on top of the git-repack --max-pack-size patchset.
>
>> Hmph, that makes the --max-pack-size patchset take this more
>> trivial and straightforward improvements hostage.  In general,
>> I'd prefer more elaborate ones based on less questionable
>> series.
>
> The max-pack-size and pack.compression patches touch the same lines.
> I thought my options were:
> * Submit independently and make you merge; or
> * Make one precede the other.
> Since max-pack-size has been out there since April 4 and
> the first acceptable version was May 1 (suggested by 0 comments),
> I didn't realize it was a "questionable series".

No, what I meant was that it is much "more elaborate" series
than this custom compression which is much "less questionable".

I think this custom compression is 1.5.2 material.  I have not
studied the code for the max-pack-size enough to be confident to
put it in 1.5.2, at least not yet, and was planning to park the
latter in 'next' until 1.5.2 final.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  5:59     ` [PATCH v2] Custom compression levels for objects and packs Junio C Hamano
@ 2007-05-09  6:24       ` Dana How
  0 siblings, 0 replies; 23+ messages in thread
From: Dana How @ 2007-05-09  6:24 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List, danahow, Nicolas Pitre

On 5/8/07, Junio C Hamano <junkio@cox.net> wrote:
> "Dana How" <danahow@gmail.com> writes:
> > On 5/8/07, Junio C Hamano <junkio@cox.net> wrote:
> >> Dana How <danahow@gmail.com> writes:
> >> > This applies on top of the git-repack --max-pack-size patchset.
> >> Hmph, that makes the --max-pack-size patchset take this more
> >> trivial and straightforward improvements hostage.  In general,
> >> I'd prefer more elaborate ones based on less questionable
> >> series.
> > The max-pack-size and pack.compression patches touch the same lines.
> > I thought my options were:
> > * Submit independently and make you merge; or
> > * Make one precede the other.
> > Since max-pack-size has been out there since April 4 and
> > the first acceptable version was May 1 (suggested by 0 comments),
> > I didn't realize it was a "questionable series".
> No, what I meant was that it is much "more elaborate" series
> than this custom compression which is much "less questionable".
>
> I think this custom compression is 1.5.2 material.  I have not
> studied the code for the max-pack-size enough to be confident to
> put it in 1.5.2, at least not yet, and was planning to park the
> latter in 'next' until 1.5.2 final.
OK, thanks for guesstimating the overall schedule.  I was starting to wonder
what the next step(s) should be.

I will incorporate your & Nicolas's comments and send out a new custom
compression patch tomorrow.  I *think* I addressed everyone's comments
on max-pack-size, but let me know if you find anything else when you get
around to it.

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  1:03       ` Nicolas Pitre
@ 2007-05-09  6:46         ` Dana How
  2007-05-09  7:13           ` Junio C Hamano
  0 siblings, 1 reply; 23+ messages in thread
From: Dana How @ 2007-05-09  6:46 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, Git Mailing List, danahow

On 5/8/07, Nicolas Pitre <nico@cam.org> wrote:
> On Tue, 8 May 2007, Dana How wrote:
> > On 5/8/07, Nicolas Pitre <nico@cam.org> wrote:
> > > On Tue, 8 May 2007, Junio C Hamano wrote:
> > > If we want the fallback logic to work, at some point we must remember if
> > > the current value is the default or if it is the result of an explicit
> > > config option.
> > I can leave as-is, or use a magic value like -99 and
> > depend on it not colliding with values in zlib.h.
> And where would you set those variables to a sensible default in the
> absence of any config option?
This is why I used the _seen variables,
but they are not necessary -- see 3rd option below.

Unfortunately we agreed a day or two ago to use a config rule like
used_value = isset(var1) ? var1 : isset(var2) ? var2 : DEFAULT.
This doesn't interact well with each variable being processed
completely independently in git_config() and the callbacks it calls.
The isset() value is "out-of-band"; either store it in the _seen
variables, or some special value in used_value .

Which makes the most sense:
* Leave _seen as-is;
* Move pack.compression recognition into config.c which means
  the _seen variables would all be local to config.c;
* Use some special value, and if still present replace it with the default
  at the end of git_config() using extra code;
* Change the config rule to something simpler.

I like the 2nd and the 4th.  You didn't like the 4th.
Shall I change to the 2nd?

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  6:46         ` Dana How
@ 2007-05-09  7:13           ` Junio C Hamano
  0 siblings, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2007-05-09  7:13 UTC (permalink / raw)
  To: Dana How; +Cc: Nicolas Pitre, Git Mailing List

"Dana How" <danahow@gmail.com> writes:

> This doesn't interact well with each variable being processed
> completely independently in git_config() and the callbacks it calls.
> The isset() value is "out-of-band"; either store it in the _seen
> variables, or some special value in used_value .
>
> Which makes the most sense:
> * Leave _seen as-is;
> * Move pack.compression recognition into config.c which means
>  the _seen variables would all be local to config.c;
> * Use some special value, and if still present replace it with the default
>  at the end of git_config() using extra code;
> * Change the config rule to something simpler.
>
> I like the 2nd and the 4th.  You didn't like the 4th.
> Shall I change to the 2nd?

FWIW, I am Ok with (1).

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  1:23     ` Nicolas Pitre
@ 2007-05-09  9:21       ` Dana How
  2007-05-09 15:27         ` Nicolas Pitre
  0 siblings, 1 reply; 23+ messages in thread
From: Dana How @ 2007-05-09  9:21 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, Git Mailing List, danahow

On 5/8/07, Nicolas Pitre <nico@cam.org> wrote:
> On Tue, 8 May 2007, Dana How wrote:
> > I think it should be straightforward for me to re-submit this
> > based on current master.
> Since this patch is simpler it could be merged much faster, before the
> pack limit series.
Yes,  I will do "custom compression" patch first.

> > > > +     /* differing core & pack compression when loose object -> must
> > > recompress */
> > > I am not sure if that is worth it, as you do not know if the
> > > loose object you are looking at were compressed with the current
> > > settings.
> > You do not know for certain, that is correct.  However, config
> > settings setting unequal compression levels signal that you
> > care differently about the two cases. (For me,  I want the
> > compression investment to correspond to the expected lifetime of the file.)
> > Also,  *if* we have the knobs we want in the config file,
> > I don't think we're going to be changing these settings all that often.
> >
> > If I didn't have this check forcing recompression in the pack,
> > then in the absence of deltification each object would enter the pack
> > by being copied (in the preceding code block) and pack.compression
> > would have little effect.  I actually experienced this the very first
> > time I imported a large dataset into git (I was trying to achieve the
> > effect of this patch by changing core.compression dynamically,  and
> > was a bit mystified for a while by the result).
> >
> > Thus,  if core.loosecompression is set to speed up git-add,  I should
> > take the time to recompress the object when packing if pack.compression
> > is different (of course the hit of not doing so will be lessened by
> > deltification
> > which forces a new compression).
>
> Right.  And this also depends whether or not you have core.legacyheaders
> set to false or not.
>
> And the whole purpose for setting core.legacyheaders is exactly to allow
> for loose objects to be copied straight into the pack.  This should have
> priority over mismatched compression levels IMHO.
OK,  I got really confused here, so I looked over the code,
and figured out 2 causes for my confusion.
(1) core.legacyheaders controls use_legacy_headers, which defaults to 1.
So currently all loose objects are in legacy format and the code block
I spoke of doesn't trigger [without a config setting].  I didn't realize
legacy headers were still being produced (mislead by the name!).
(2) I read your "setting core.legacyheaders" as followed by TRUE,
but you meant FALSE, which is not the default.

I also read that 1 year after 1.4.2, the default for core.legacyheaders is going
to change to FALSE.  I think our discussion should assume this has
happened.  So let's assume FALSE in the following.  The point of that
is that such a FALSE setting can't be assumed to have any special intent;
it will be the default.

[Everything I write here boils down to only one question,
 which I repeat at the end.]

Data gets into a pack in these ways:
1. Loose object copied in;
2. Loose object newly deltified;
3. Packed object to be copied;
4. Packed object to be newly deltified;
5. Packed deltified object we can't re-use;
6. Packed deltified object we can re-use.
["copied" includes recompressed.]

In (2), (4), and (5), pack.compression will always be newly used.
If pack.compression doesn't change,  this means (6)
will be using pack.compression since it comes from (2) or (4).
So if I "guarantee" that (1) uses pack.compression,
(3) will as well, meaning everything in the pack will be
at pack.compression.

Thus if pack.compression != core.loosecompression takes precedence
over core.legacyheaders = false,  then for pack.compression constant
we get all 6 cases at level pack.compression.  If core.legacyheaders =
false takes precedence as you suggest,  then all undeltified objects
(20%?) will be stuck at core.loosecompression [since I see no way
to sensibly re-apply compression to something copied pack-to-pack].
I think this is inconsistent with what a pack.compression !=
core.loosecompression setting is telling us.

My arguments have 2 biases.
(a) I assume pack.compression stays constant, and if it changes,
I see little value in worrying about "forcing" all objects in a new pack,
some of which might be copied from an old pack, to be at the same
compression level.
(b) I focused first on packing to disk,  where a constant pack.compression
makes sense. For packing to stdout (pack transfer),  especially on
a popular, hence loaded, repository server, we may want to use
--compress=N to crank down on effort spent compressing new deltas [2+4]
(or loose objects [1]) (or unusable deltas [5]).  This still lets
better compressed
objects flow through in the other cases [3+6].  So for one-use packs,
a mix of compression levels could be desirable and
is achievable with --compression=N if/when so.

> Also, when repacking, delta reuse does not recompress objects for the
> same reason, regardless of the compression level used when they were
> compressed initially.  Same argument goes for delta depth.
Delta reuse doesn't need recompression.  For pack.compression
constant,  they will already be at the correct level.  I think we agree
on behavior here for different reasons.

> So if you really want to ensure a compression level on the whole pack,
> you'll have to use -f with git-repack. Or leave core.legacyheaders
> unset.
-f would be needed if you were in the practice of changing
pack.compression, yes.

So, after covering all these cases, have I convinced you that
recompression takes precedence over legacyheaders = FALSE?

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-08 22:38 [PATCH v2] Custom compression levels for objects and packs Dana How
  2007-05-08 23:56 ` Junio C Hamano
  2007-05-09  0:30 ` Petr Baudis
@ 2007-05-09 13:56 ` Theodore Tso
  2007-05-09 16:44   ` Dana How
  2 siblings, 1 reply; 23+ messages in thread
From: Theodore Tso @ 2007-05-09 13:56 UTC (permalink / raw)
  To: Dana How; +Cc: Junio C Hamano, Git Mailing List

I noticed that the patch didn't include additions to
Documentation/config.txt; could those be added, so that as much as
possible all of the various configuration knobs are documented in one
place, please?

Thanks!!

					- Ted

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09  9:21       ` Dana How
@ 2007-05-09 15:27         ` Nicolas Pitre
  2007-05-09 16:26           ` Junio C Hamano
  0 siblings, 1 reply; 23+ messages in thread
From: Nicolas Pitre @ 2007-05-09 15:27 UTC (permalink / raw)
  To: Dana How; +Cc: Junio C Hamano, Git Mailing List

On Wed, 9 May 2007, Dana How wrote:

> OK,  I got really confused here, so I looked over the code,
> and figured out 2 causes for my confusion.
> (1) core.legacyheaders controls use_legacy_headers, which defaults to 1.
> So currently all loose objects are in legacy format and the code block
> I spoke of doesn't trigger [without a config setting].  I didn't realize
> legacy headers were still being produced (mislead by the name!).
> (2) I read your "setting core.legacyheaders" as followed by TRUE,
> but you meant FALSE, which is not the default.
> 
> I also read that 1 year after 1.4.2, the default for core.legacyheaders is
> going
> to change to FALSE.  I think our discussion should assume this has
> happened.

<tangential comment>

Now that we encourage and actively preserve objects in a packed form 
more agressively than we did at the time core.legacyheaders was 
introduced, I really wonder if this option is still worth it.  Because 
the packing of loose objects has to go through the delta match loop 
anyway, and since most of them should end up being deltified in most 
cases, there is really little advantage to have this parallel loose 
object format as the CPU savings it might provide is rather lost in the 
noise in the end.  

So I suggest that we get rid of core.legacyheaders, preserve the legacy 
format as the only writable loose object format and deprecate the other 
one to keep things simpler.  Thoughts?

</tangential comment>

> So let's assume FALSE in the following.  The point of that is that 
> such a FALSE setting can't be assumed to have any special intent; it 
> will be the default.
> 
> [Everything I write here boils down to only one question,
> which I repeat at the end.]
> 
> Data gets into a pack in these ways:
> 1. Loose object copied in;
> 2. Loose object newly deltified;
> 3. Packed object to be copied;
> 4. Packed object to be newly deltified;
> 5. Packed deltified object we can't re-use;
> 6. Packed deltified object we can re-use.
> ["copied" includes recompressed.]

I think you forgets "packed undeltified objects we can reuse".

> In (2), (4), and (5), pack.compression will always be newly used.
> If pack.compression doesn't change,  this means (6)
> will be using pack.compression since it comes from (2) or (4).
> So if I "guarantee" that (1) uses pack.compression,
> (3) will as well, meaning everything in the pack will be
> at pack.compression.
> 
> Thus if pack.compression != core.loosecompression takes precedence
> over core.legacyheaders = false,  then for pack.compression constant
> we get all 6 cases at level pack.compression.  If core.legacyheaders =
> false takes precedence as you suggest,  then all undeltified objects
> (20%?) will be stuck at core.loosecompression [since I see no way
> to sensibly re-apply compression to something copied pack-to-pack].
> 
> I think this is inconsistent with what a pack.compression !=
> core.loosecompression setting is telling us.

OK I see that I missed the fact that git-repack -f (or git-pack-objects 
--no-reuse-delta) does not recompress undeltified objects.  Note this is 
a problem in the case where you change pack.compression to a different 
value or override it on the command line as well: reused undeltified 
objects won't get recompressed with the new level.  My rant on 
core.legacyheaders and its removal would address the first case.  Your 
test for a difference between loose and packed compression levels is 
flawed because the value of core.compression does not necessarily 
represent the compression level that was used for the loose objects to 
pack (core.compression might have been modified since then), but it 
only addresses the first case too.  And this is a problem even now.

What we need instead is a --no-reuse-object that would force 
recompression of everything when you really want to enforce a specific 
compression level across the whole pack(s).


Nicolas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09 15:27         ` Nicolas Pitre
@ 2007-05-09 16:26           ` Junio C Hamano
  2007-05-09 16:42             ` Dana How
                               ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Junio C Hamano @ 2007-05-09 16:26 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Dana How, Git Mailing List

Nicolas Pitre <nico@cam.org> writes:

> So I suggest that we get rid of core.legacyheaders, preserve the legacy 
> format as the only writable loose object format and deprecate the other 
> one to keep things simpler.  Thoughts?

I agree with your analysis, especially when deeper delta chains
are allowed, straight copy of loose object becomes less and less
likely.

> What we need instead is a --no-reuse-object that would force 
> recompression of everything when you really want to enforce a specific 
> compression level across the whole pack(s).

Yeah.  Or maybe --no-reuse to mean both and make '-f' a
short-hand synonym for that.

I do not see much reason to want to tweak them independently;
recomputing delta is much more expensive than recompressing
anyway, and when the user says 'repack -f', it is a sign that
the user is willing to spend CPU cycles.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09 16:26           ` Junio C Hamano
@ 2007-05-09 16:42             ` Dana How
  2007-05-09 16:59             ` [PATCH] make "repack -f" imply "pack-objects --no-reuse-object" Nicolas Pitre
  2007-05-09 18:42             ` [PATCH] deprecate the new loose object header format Nicolas Pitre
  2 siblings, 0 replies; 23+ messages in thread
From: Dana How @ 2007-05-09 16:42 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nicolas Pitre, Git Mailing List, danahow

On 5/9/07, Junio C Hamano <junkio@cox.net> wrote:
> Nicolas Pitre <nico@cam.org> writes:
>
> > So I suggest that we get rid of core.legacyheaders, preserve the legacy
> > format as the only writable loose object format and deprecate the other
> > one to keep things simpler.  Thoughts?
>
> I agree with your analysis, especially when deeper delta chains
> are allowed, straight copy of loose object becomes less and less
> likely.
I too agree with the logic in the original <tangential comment>,
and deprecation is the appropriate thing.  So I'm not actually
going to change anything here in today's patch.

> > What we need instead is a --no-reuse-object that would force
> > recompression of everything when you really want to enforce a specific
> > compression level across the whole pack(s).
>
> Yeah.  Or maybe --no-reuse to mean both and make '-f' a
> short-hand synonym for that.
>
> I do not see much reason to want to tweak them independently;
> recomputing delta is much more expensive than recompressing
> anyway, and when the user says 'repack -f', it is a sign that
> the user is willing to spend CPU cycles.
We have the same idea about git-repack -f -- it implies both
--no-reuse-delta and --no-reuse-object , and I will do that.
I will incorporate Nicolas's new --no-reuse-object
in my patch to builtin-pack-objects.c .
I won't go so far as collapsing --no-reuse-* to --no-reuse in this patch.

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Custom compression levels for objects and packs
  2007-05-09 13:56 ` Theodore Tso
@ 2007-05-09 16:44   ` Dana How
  0 siblings, 0 replies; 23+ messages in thread
From: Dana How @ 2007-05-09 16:44 UTC (permalink / raw)
  To: Theodore Tso; +Cc: Junio C Hamano, Git Mailing List, danahow

On 5/9/07, Theodore Tso <tytso@mit.edu> wrote:
> I noticed that the patch didn't include additions to
> Documentation/config.txt; could those be added, so that as much as
> possible all of the various configuration knobs are documented in one
> place, please?

I will update the documentation,  after we've stopped changing
the set of options and behaviors.

You can see that's just what I did in the max-pack-size patch.

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] make "repack -f" imply "pack-objects --no-reuse-object"
  2007-05-09 16:26           ` Junio C Hamano
  2007-05-09 16:42             ` Dana How
@ 2007-05-09 16:59             ` Nicolas Pitre
  2007-05-09 18:42             ` [PATCH] deprecate the new loose object header format Nicolas Pitre
  2 siblings, 0 replies; 23+ messages in thread
From: Nicolas Pitre @ 2007-05-09 16:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Dana How, Git Mailing List

As Junio said:

  Recomputing delta is much more expensive than recompressing
  anyway, and when the user says 'repack -f', it is a sign that
  the user is willing to spend CPU cycles.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---

On Wed, 9 May 2007, Junio C Hamano wrote:

> Nicolas Pitre <nico@cam.org> writes:
> 
> > What we need instead is a --no-reuse-object that would force 
> > recompression of everything when you really want to enforce a specific 
> > compression level across the whole pack(s).
> 
> Yeah.  Or maybe --no-reuse to mean both and make '-f' a
> short-hand synonym for that.
> 
> I do not see much reason to want to tweak them independently;
> recomputing delta is much more expensive than recompressing
> anyway, and when the user says 'repack -f', it is a sign that
> the user is willing to spend CPU cycles.

This applies on top of my --no-reuse-object patch.  I think it is good 
to have the flexibility in the plumbing, but as you say git-repack 
might as well use the strongest option.

diff --git a/git-repack.sh b/git-repack.sh
index ddfa8b4..8bf66a4 100755
--- a/git-repack.sh
+++ b/git-repack.sh
@@ -8,7 +8,7 @@ SUBDIRECTORY_OK='Yes'
 . git-sh-setup
 
 no_update_info= all_into_one= remove_redundant=
-local= quiet= no_reuse_delta= extra=
+local= quiet= no_reuse= extra=
 while case "$#" in 0) break ;; esac
 do
 	case "$1" in
@@ -16,7 +16,7 @@ do
 	-a)	all_into_one=t ;;
 	-d)	remove_redundant=t ;;
 	-q)	quiet=-q ;;
-	-f)	no_reuse_delta=--no-reuse-delta ;;
+	-f)	no_reuse=--no-reuse-object ;;
 	-l)	local=--local ;;
 	--window=*) extra="$extra $1" ;;
 	--depth=*) extra="$extra $1" ;;
@@ -61,7 +61,7 @@ case ",$all_into_one," in
 	;;
 esac
 
-args="$args $local $quiet $no_reuse_delta$extra"
+args="$args $local $quiet $no_reuse$extra"
 name=$(git-pack-objects --non-empty --all --reflog $args </dev/null "$PACKTMP") ||
 	exit 1
 if [ -z "$name" ]; then

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH] deprecate the new loose object header format
  2007-05-09 16:26           ` Junio C Hamano
  2007-05-09 16:42             ` Dana How
  2007-05-09 16:59             ` [PATCH] make "repack -f" imply "pack-objects --no-reuse-object" Nicolas Pitre
@ 2007-05-09 18:42             ` Nicolas Pitre
  2007-05-09 20:16               ` Dana How
  2 siblings, 1 reply; 23+ messages in thread
From: Nicolas Pitre @ 2007-05-09 18:42 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Dana How, Git Mailing List

Now that we encourage and actively preserve objects in a packed form 
more agressively than we did at the time the new loose object format and 
core.legacyheaders were introduced, that extra loose object format 
doesn't appear to be worth it anymore.

Because the packing of loose objects has to go through the delta match 
loop anyway, and since most of them should end up being deltified in 
most cases, there is really little advantage to have this parallel loose 
object format as the CPU savings it might provide is rather lost in the 
noise in the end.

This patch gets rid of core.legacyheaders, preserve the legacy format as 
the only writable loose object format and deprecate the other one to 
keep things simpler.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---

On Wed, 9 May 2007, Junio C Hamano wrote:

> I agree with your analysis, especially when deeper delta chains
> are allowed, straight copy of loose object becomes less and less
> likely.

So here it is, with a nice code reduction:

 Documentation/config.txt |   13 --------
 builtin-pack-objects.c   |   69 ----------------------------------------------
 cache.h                  |    2 -
 config.c                 |    5 ---
 environment.c            |    1 -
 sha1_file.c              |   47 +++++++------------------------
 6 files changed, 11 insertions(+), 126 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ea434af..d6d89ba 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -209,19 +209,6 @@ core.compression::
 	compression, and 1..9 are various speed/size tradeoffs, 9 being
 	slowest.
 
-core.legacyheaders::
-	A boolean which
-	changes the format of loose objects so that they are more
-	efficient to pack and to send out of the repository over git
-	native protocol, since v1.4.2.  However, loose objects
-	written in the new format cannot be read by git older than
-	that version; people fetching from your repository using
-	older versions of git over dumb transports (e.g. http)
-	will also be affected.
-+
-To let git use the new loose object format, you have to
-set core.legacyheaders to false.
-
 core.packedGitWindowSize::
 	Number of bytes of a pack file to map into memory in a
 	single mapping operation.  Larger window sizes may allow
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 966f843..c74a361 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -346,56 +346,6 @@ static void copy_pack_data(struct sha1file *f,
 	}
 }
 
-static int check_loose_inflate(unsigned char *data, unsigned long len, unsigned long expect)
-{
-	z_stream stream;
-	unsigned char fakebuf[4096];
-	int st;
-
-	memset(&stream, 0, sizeof(stream));
-	stream.next_in = data;
-	stream.avail_in = len;
-	stream.next_out = fakebuf;
-	stream.avail_out = sizeof(fakebuf);
-	inflateInit(&stream);
-
-	while (1) {
-		st = inflate(&stream, Z_FINISH);
-		if (st == Z_STREAM_END || st == Z_OK) {
-			st = (stream.total_out == expect &&
-			      stream.total_in == len) ? 0 : -1;
-			break;
-		}
-		if (st != Z_BUF_ERROR) {
-			st = -1;
-			break;
-		}
-		stream.next_out = fakebuf;
-		stream.avail_out = sizeof(fakebuf);
-	}
-	inflateEnd(&stream);
-	return st;
-}
-
-static int revalidate_loose_object(struct object_entry *entry,
-				   unsigned char *map,
-				   unsigned long mapsize)
-{
-	/* we already know this is a loose object with new type header. */
-	enum object_type type;
-	unsigned long size, used;
-
-	if (pack_to_stdout)
-		return 0;
-
-	used = unpack_object_header_gently(map, mapsize, &type, &size);
-	if (!used)
-		return -1;
-	map += used;
-	mapsize -= used;
-	return check_loose_inflate(map, mapsize, size);
-}
-
 static unsigned long write_object(struct sha1file *f,
 				  struct object_entry *entry)
 {
@@ -425,25 +375,6 @@ static unsigned long write_object(struct sha1file *f,
 				 * and we do not need to deltify it.
 				 */
 
-	if (!entry->in_pack && !entry->delta) {
-		unsigned char *map;
-		unsigned long mapsize;
-		map = map_sha1_file(entry->sha1, &mapsize);
-		if (map && !legacy_loose_object(map)) {
-			/* We can copy straight into the pack file */
-			if (revalidate_loose_object(entry, map, mapsize))
-				die("corrupt loose object %s",
-				    sha1_to_hex(entry->sha1));
-			sha1write(f, map, mapsize);
-			munmap(map, mapsize);
-			written++;
-			reused++;
-			return mapsize;
-		}
-		if (map)
-			munmap(map, mapsize);
-	}
-
 	if (!to_reuse) {
 		buf = read_sha1_file(entry->sha1, &type, &size);
 		if (!buf)
diff --git a/cache.h b/cache.h
index 8e76152..5725bce 100644
--- a/cache.h
+++ b/cache.h
@@ -273,7 +273,6 @@ extern void rollback_lock_file(struct lock_file *);
 extern int delete_ref(const char *, const unsigned char *sha1);
 
 /* Environment bits from configuration mechanism */
-extern int use_legacy_headers;
 extern int trust_executable_bit;
 extern int has_symlinks;
 extern int assume_unchanged;
@@ -354,7 +353,6 @@ extern int move_temp_to_file(const char *tmpfile, const char *filename);
 extern int has_sha1_pack(const unsigned char *sha1, const char **ignore);
 extern int has_sha1_file(const unsigned char *sha1);
 extern void *map_sha1_file(const unsigned char *sha1, unsigned long *);
-extern int legacy_loose_object(unsigned char *);
 
 extern int has_pack_file(const unsigned char *sha1);
 extern int has_pack_index(const unsigned char *sha1);
diff --git a/config.c b/config.c
index 70d1055..298966f 100644
--- a/config.c
+++ b/config.c
@@ -299,11 +299,6 @@ int git_default_config(const char *var, const char *value)
 		return 0;
 	}
 
-	if (!strcmp(var, "core.legacyheaders")) {
-		use_legacy_headers = git_config_bool(var, value);
-		return 0;
-	}
-
 	if (!strcmp(var, "core.compression")) {
 		int level = git_config_int(var, value);
 		if (level == -1)
diff --git a/environment.c b/environment.c
index 2231659..54e3aba 100644
--- a/environment.c
+++ b/environment.c
@@ -11,7 +11,6 @@
 
 char git_default_email[MAX_GITNAME];
 char git_default_name[MAX_GITNAME];
-int use_legacy_headers = 1;
 int trust_executable_bit = 1;
 int has_symlinks = 1;
 int assume_unchanged;
diff --git a/sha1_file.c b/sha1_file.c
index 32244d7..e715527 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -972,7 +972,7 @@ void *map_sha1_file(const unsigned char *sha1, unsigned long *size)
 	return map;
 }
 
-int legacy_loose_object(unsigned char *map)
+static int legacy_loose_object(unsigned char *map)
 {
 	unsigned int word;
 
@@ -1034,6 +1034,14 @@ static int unpack_sha1_header(z_stream *stream, unsigned char *map, unsigned lon
 		return inflate(stream, 0);
 	}
 
+
+	/*
+	 * There used to be a second loose object header format which
+	 * was meant to mimic the in-pack format, allowing for direct
+	 * copy of the object data.  This format turned up not to be
+	 * really worth it and we don't write it any longer.  But we
+	 * can still read it.
+	 */
 	used = unpack_object_header_gently(map, mapsize, &type, &size);
 	if (!used || !valid_loose_object_type[type])
 		return -1;
@@ -1962,40 +1970,6 @@ static int write_buffer(int fd, const void *buf, size_t len)
 	return 0;
 }
 
-static int write_binary_header(unsigned char *hdr, enum object_type type, unsigned long len)
-{
-	int hdr_len;
-	unsigned char c;
-
-	c = (type << 4) | (len & 15);
-	len >>= 4;
-	hdr_len = 1;
-	while (len) {
-		*hdr++ = c | 0x80;
-		hdr_len++;
-		c = (len & 0x7f);
-		len >>= 7;
-	}
-	*hdr = c;
-	return hdr_len;
-}
-
-static void setup_object_header(z_stream *stream, const char *type, unsigned long len)
-{
-	int obj_type, hdrlen;
-
-	if (use_legacy_headers) {
-		while (deflate(stream, 0) == Z_OK)
-			/* nothing */;
-		return;
-	}
-	obj_type = type_from_string(type);
-	hdrlen = write_binary_header(stream->next_out, obj_type, len);
-	stream->total_out = hdrlen;
-	stream->next_out += hdrlen;
-	stream->avail_out -= hdrlen;
-}
-
 int hash_sha1_file(const void *buf, unsigned long len, const char *type,
                    unsigned char *sha1)
 {
@@ -2062,7 +2036,8 @@ int write_sha1_file(void *buf, unsigned long len, const char *type, unsigned cha
 	/* First header.. */
 	stream.next_in = (unsigned char *)hdr;
 	stream.avail_in = hdrlen;
-	setup_object_header(&stream, type, len);
+	while (deflate(&stream, 0) == Z_OK)
+		/* nothing */;
 
 	/* Then the data itself.. */
 	stream.next_in = buf;

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] deprecate the new loose object header format
  2007-05-09 18:42             ` [PATCH] deprecate the new loose object header format Nicolas Pitre
@ 2007-05-09 20:16               ` Dana How
  2007-05-09 20:42                 ` Nicolas Pitre
  0 siblings, 1 reply; 23+ messages in thread
From: Dana How @ 2007-05-09 20:16 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, Git Mailing List, danahow

This doesn't just deprecate the format,
it removes the ability to create it.
So to me this patch goes too far.

Also, if we're interested in "simpler", wouldn't
it be better for loose and in-pack objects to be
the same?  I thought that was the point of
!legacy_headers.  Whatever the decision is,
it certainly has little impact on CPU effort
as you point out.

Maybe I'm too conservative because I'm less
famililar with the code and haven't written it.
Personally I oscillate between adding and refactoring
(you haven't seen any refactoring yet ;-) ).

Thanks,

Dana

On 5/9/07, Nicolas Pitre <nico@cam.org> wrote:
> Now that we encourage and actively preserve objects in a packed form
> more agressively than we did at the time the new loose object format and
> core.legacyheaders were introduced, that extra loose object format
> doesn't appear to be worth it anymore.
>
> Because the packing of loose objects has to go through the delta match
> loop anyway, and since most of them should end up being deltified in
> most cases, there is really little advantage to have this parallel loose
> object format as the CPU savings it might provide is rather lost in the
> noise in the end.
>
> This patch gets rid of core.legacyheaders, preserve the legacy format as
> the only writable loose object format and deprecate the other one to
> keep things simpler.
>
> Signed-off-by: Nicolas Pitre <nico@cam.org>
> ---
>
> On Wed, 9 May 2007, Junio C Hamano wrote:
>
> > I agree with your analysis, especially when deeper delta chains
> > are allowed, straight copy of loose object becomes less and less
> > likely.
>
> So here it is, with a nice code reduction:
>
>  Documentation/config.txt |   13 --------
>  builtin-pack-objects.c   |   69 ----------------------------------------------
>  cache.h                  |    2 -
>  config.c                 |    5 ---
>  environment.c            |    1 -
>  sha1_file.c              |   47 +++++++------------------------
>  6 files changed, 11 insertions(+), 126 deletions(-)
>
> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index ea434af..d6d89ba 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -209,19 +209,6 @@ core.compression::
>         compression, and 1..9 are various speed/size tradeoffs, 9 being
>         slowest.
>
> -core.legacyheaders::
> -       A boolean which
> -       changes the format of loose objects so that they are more
> -       efficient to pack and to send out of the repository over git
> -       native protocol, since v1.4.2.  However, loose objects
> -       written in the new format cannot be read by git older than
> -       that version; people fetching from your repository using
> -       older versions of git over dumb transports (e.g. http)
> -       will also be affected.
> -+
> -To let git use the new loose object format, you have to
> -set core.legacyheaders to false.
> -
>  core.packedGitWindowSize::
>         Number of bytes of a pack file to map into memory in a
>         single mapping operation.  Larger window sizes may allow
> diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
> index 966f843..c74a361 100644
> --- a/builtin-pack-objects.c
> +++ b/builtin-pack-objects.c
> @@ -346,56 +346,6 @@ static void copy_pack_data(struct sha1file *f,
>         }
>  }
>
> -static int check_loose_inflate(unsigned char *data, unsigned long len, unsigned long expect)
> -{
> -       z_stream stream;
> -       unsigned char fakebuf[4096];
> -       int st;
> -
> -       memset(&stream, 0, sizeof(stream));
> -       stream.next_in = data;
> -       stream.avail_in = len;
> -       stream.next_out = fakebuf;
> -       stream.avail_out = sizeof(fakebuf);
> -       inflateInit(&stream);
> -
> -       while (1) {
> -               st = inflate(&stream, Z_FINISH);
> -               if (st == Z_STREAM_END || st == Z_OK) {
> -                       st = (stream.total_out == expect &&
> -                             stream.total_in == len) ? 0 : -1;
> -                       break;
> -               }
> -               if (st != Z_BUF_ERROR) {
> -                       st = -1;
> -                       break;
> -               }
> -               stream.next_out = fakebuf;
> -               stream.avail_out = sizeof(fakebuf);
> -       }
> -       inflateEnd(&stream);
> -       return st;
> -}
> -
> -static int revalidate_loose_object(struct object_entry *entry,
> -                                  unsigned char *map,
> -                                  unsigned long mapsize)
> -{
> -       /* we already know this is a loose object with new type header. */
> -       enum object_type type;
> -       unsigned long size, used;
> -
> -       if (pack_to_stdout)
> -               return 0;
> -
> -       used = unpack_object_header_gently(map, mapsize, &type, &size);
> -       if (!used)
> -               return -1;
> -       map += used;
> -       mapsize -= used;
> -       return check_loose_inflate(map, mapsize, size);
> -}
> -
>  static unsigned long write_object(struct sha1file *f,
>                                   struct object_entry *entry)
>  {
> @@ -425,25 +375,6 @@ static unsigned long write_object(struct sha1file *f,
>                                  * and we do not need to deltify it.
>                                  */
>
> -       if (!entry->in_pack && !entry->delta) {
> -               unsigned char *map;
> -               unsigned long mapsize;
> -               map = map_sha1_file(entry->sha1, &mapsize);
> -               if (map && !legacy_loose_object(map)) {
> -                       /* We can copy straight into the pack file */
> -                       if (revalidate_loose_object(entry, map, mapsize))
> -                               die("corrupt loose object %s",
> -                                   sha1_to_hex(entry->sha1));
> -                       sha1write(f, map, mapsize);
> -                       munmap(map, mapsize);
> -                       written++;
> -                       reused++;
> -                       return mapsize;
> -               }
> -               if (map)
> -                       munmap(map, mapsize);
> -       }
> -
>         if (!to_reuse) {
>                 buf = read_sha1_file(entry->sha1, &type, &size);
>                 if (!buf)
> diff --git a/cache.h b/cache.h
> index 8e76152..5725bce 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -273,7 +273,6 @@ extern void rollback_lock_file(struct lock_file *);
>  extern int delete_ref(const char *, const unsigned char *sha1);
>
>  /* Environment bits from configuration mechanism */
> -extern int use_legacy_headers;
>  extern int trust_executable_bit;
>  extern int has_symlinks;
>  extern int assume_unchanged;
> @@ -354,7 +353,6 @@ extern int move_temp_to_file(const char *tmpfile, const char *filename);
>  extern int has_sha1_pack(const unsigned char *sha1, const char **ignore);
>  extern int has_sha1_file(const unsigned char *sha1);
>  extern void *map_sha1_file(const unsigned char *sha1, unsigned long *);
> -extern int legacy_loose_object(unsigned char *);
>
>  extern int has_pack_file(const unsigned char *sha1);
>  extern int has_pack_index(const unsigned char *sha1);
> diff --git a/config.c b/config.c
> index 70d1055..298966f 100644
> --- a/config.c
> +++ b/config.c
> @@ -299,11 +299,6 @@ int git_default_config(const char *var, const char *value)
>                 return 0;
>         }
>
> -       if (!strcmp(var, "core.legacyheaders")) {
> -               use_legacy_headers = git_config_bool(var, value);
> -               return 0;
> -       }
> -
>         if (!strcmp(var, "core.compression")) {
>                 int level = git_config_int(var, value);
>                 if (level == -1)
> diff --git a/environment.c b/environment.c
> index 2231659..54e3aba 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -11,7 +11,6 @@
>
>  char git_default_email[MAX_GITNAME];
>  char git_default_name[MAX_GITNAME];
> -int use_legacy_headers = 1;
>  int trust_executable_bit = 1;
>  int has_symlinks = 1;
>  int assume_unchanged;
> diff --git a/sha1_file.c b/sha1_file.c
> index 32244d7..e715527 100644
> --- a/sha1_file.c
> +++ b/sha1_file.c
> @@ -972,7 +972,7 @@ void *map_sha1_file(const unsigned char *sha1, unsigned long *size)
>         return map;
>  }
>
> -int legacy_loose_object(unsigned char *map)
> +static int legacy_loose_object(unsigned char *map)
>  {
>         unsigned int word;
>
> @@ -1034,6 +1034,14 @@ static int unpack_sha1_header(z_stream *stream, unsigned char *map, unsigned lon
>                 return inflate(stream, 0);
>         }
>
> +
> +       /*
> +        * There used to be a second loose object header format which
> +        * was meant to mimic the in-pack format, allowing for direct
> +        * copy of the object data.  This format turned up not to be
> +        * really worth it and we don't write it any longer.  But we
> +        * can still read it.
> +        */
>         used = unpack_object_header_gently(map, mapsize, &type, &size);
>         if (!used || !valid_loose_object_type[type])
>                 return -1;
> @@ -1962,40 +1970,6 @@ static int write_buffer(int fd, const void *buf, size_t len)
>         return 0;
>  }
>
> -static int write_binary_header(unsigned char *hdr, enum object_type type, unsigned long len)
> -{
> -       int hdr_len;
> -       unsigned char c;
> -
> -       c = (type << 4) | (len & 15);
> -       len >>= 4;
> -       hdr_len = 1;
> -       while (len) {
> -               *hdr++ = c | 0x80;
> -               hdr_len++;
> -               c = (len & 0x7f);
> -               len >>= 7;
> -       }
> -       *hdr = c;
> -       return hdr_len;
> -}
> -
> -static void setup_object_header(z_stream *stream, const char *type, unsigned long len)
> -{
> -       int obj_type, hdrlen;
> -
> -       if (use_legacy_headers) {
> -               while (deflate(stream, 0) == Z_OK)
> -                       /* nothing */;
> -               return;
> -       }
> -       obj_type = type_from_string(type);
> -       hdrlen = write_binary_header(stream->next_out, obj_type, len);
> -       stream->total_out = hdrlen;
> -       stream->next_out += hdrlen;
> -       stream->avail_out -= hdrlen;
> -}
> -
>  int hash_sha1_file(const void *buf, unsigned long len, const char *type,
>                     unsigned char *sha1)
>  {
> @@ -2062,7 +2036,8 @@ int write_sha1_file(void *buf, unsigned long len, const char *type, unsigned cha
>         /* First header.. */
>         stream.next_in = (unsigned char *)hdr;
>         stream.avail_in = hdrlen;
> -       setup_object_header(&stream, type, len);
> +       while (deflate(&stream, 0) == Z_OK)
> +               /* nothing */;
>
>         /* Then the data itself.. */
>         stream.next_in = buf;
>


-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] deprecate the new loose object header format
  2007-05-09 20:16               ` Dana How
@ 2007-05-09 20:42                 ` Nicolas Pitre
  2007-05-09 21:00                   ` Dana How
  0 siblings, 1 reply; 23+ messages in thread
From: Nicolas Pitre @ 2007-05-09 20:42 UTC (permalink / raw)
  To: Dana How; +Cc: Junio C Hamano, Git Mailing List

On Wed, 9 May 2007, Dana How wrote:

> This doesn't just deprecate the format,
> it removes the ability to create it.
> So to me this patch goes too far.

Well, I disagree.  We cannot create it anymore, but we still can read 
it.  Older Git versions nay not even read it.  And since this format was 
off by default anyway, I doubt you've lost anything.

> Also, if we're interested in "simpler", wouldn't
> it be better for loose and in-pack objects to be
> the same?  I thought that was the point of
> !legacy_headers.

It is not simpler because:

 1) backward compatibility requires the legacy format, and

 2) the object SHA1 is always computed with the legacy header included.

So what is simpler is really to get rid of over 100 lines of code that 
didn't provide a real benefit.  The faster we remove the ability to 
write such objects the fewer they'll be in the field.


Nicolas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] deprecate the new loose object header format
  2007-05-09 20:42                 ` Nicolas Pitre
@ 2007-05-09 21:00                   ` Dana How
  0 siblings, 0 replies; 23+ messages in thread
From: Dana How @ 2007-05-09 21:00 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, Git Mailing List, danahow

On 5/9/07, Nicolas Pitre <nico@cam.org> wrote:
> It is not simpler because:
>  2) the object SHA1 is always computed with the legacy header included.
That convinces me.

Thanks,
-- 
Dana L. How  danahow@gmail.com  +1 650 804 5991 cell

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2007-05-09 21:01 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-05-08 22:38 [PATCH v2] Custom compression levels for objects and packs Dana How
2007-05-08 23:56 ` Junio C Hamano
2007-05-09  0:16   ` Nicolas Pitre
2007-05-09  0:29     ` Dana How
2007-05-09  1:03       ` Nicolas Pitre
2007-05-09  6:46         ` Dana How
2007-05-09  7:13           ` Junio C Hamano
2007-05-09  0:25   ` Dana How
2007-05-09  1:23     ` Nicolas Pitre
2007-05-09  9:21       ` Dana How
2007-05-09 15:27         ` Nicolas Pitre
2007-05-09 16:26           ` Junio C Hamano
2007-05-09 16:42             ` Dana How
2007-05-09 16:59             ` [PATCH] make "repack -f" imply "pack-objects --no-reuse-object" Nicolas Pitre
2007-05-09 18:42             ` [PATCH] deprecate the new loose object header format Nicolas Pitre
2007-05-09 20:16               ` Dana How
2007-05-09 20:42                 ` Nicolas Pitre
2007-05-09 21:00                   ` Dana How
2007-05-09  5:59     ` [PATCH v2] Custom compression levels for objects and packs Junio C Hamano
2007-05-09  6:24       ` Dana How
2007-05-09  0:30 ` Petr Baudis
2007-05-09 13:56 ` Theodore Tso
2007-05-09 16:44   ` Dana How

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).