[PATCH] update-server-info: avoid needless overwrites

git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed

* [PATCH] update-server-info: avoid needless overwrites
@ 2019-05-11  1:34 Eric Wong
  2019-05-11  7:35 ` Eric Sunshine
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Eric Wong @ 2019-05-11  1:34 UTC (permalink / raw)
  To: git

Do not change the existing info/refs and objects/info/packs
files if they match the existing content on the filesystem.
This is intended to preserve mtime and make it easier for dumb
HTTP pollers to rely on the If-Modified-Since header.

Combined with stdio and kernel buffering; the kernel should be
able to avoid block layer writes and reduce wear.

Signed-off-by: Eric Wong <e@80x24.org>
---
 server-info.c                 | 61 +++++++++++++++++++++++++++++------
 t/t5200-update-server-info.sh | 41 +++++++++++++++++++++++
 2 files changed, 92 insertions(+), 10 deletions(-)
 create mode 100755 t/t5200-update-server-info.sh

diff --git a/server-info.c b/server-info.c
index 41274d098b..34599e4817 100644
--- a/server-info.c
+++ b/server-info.c
@@ -6,37 +6,78 @@
 #include "tag.h"
 #include "packfile.h"
 #include "object-store.h"
+#include "strbuf.h"
+
+static int files_differ(FILE *fp, const char *path)
+{
+	struct stat st;
+	git_hash_ctx c;
+	struct object_id oid_old, oid_new;
+	struct strbuf tmp = STRBUF_INIT;
+	long new_len = ftell(fp);
+
+	if (new_len < 0 || stat(path, &st) < 0)
+		return 1;
+	if (!S_ISREG(st.st_mode))
+		return 1;
+	if ((off_t)new_len != st.st_size)
+		return 1;
+
+	rewind(fp);
+	if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len)
+		return 1;
+	the_hash_algo->init_fn(&c);
+	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
+	the_hash_algo->final_fn(oid_new.hash, &c);
+	strbuf_release(&tmp);
+
+	if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0)
+		return 1;
+	the_hash_algo->init_fn(&c);
+	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
+	the_hash_algo->final_fn(oid_old.hash, &c);
+	strbuf_release(&tmp);
+
+	return hashcmp(oid_old.hash, oid_new.hash);
+}
 
 /*
  * Create the file "path" by writing to a temporary file and renaming
  * it into place. The contents of the file come from "generate", which
  * should return non-zero if it encounters an error.
  */
-static int update_info_file(char *path, int (*generate)(FILE *))
+static int update_info_file(char *path, int (*generate)(FILE *), int force)
 {
 	char *tmp = mkpathdup("%s_XXXXXX", path);
 	int ret = -1;
 	int fd = -1;
 	FILE *fp = NULL, *to_close;
+	int do_update;
 
 	safe_create_leading_directories(path);
 	fd = git_mkstemp_mode(tmp, 0666);
 	if (fd < 0)
 		goto out;
-	to_close = fp = fdopen(fd, "w");
+	to_close = fp = fdopen(fd, "w+");
 	if (!fp)
 		goto out;
 	fd = -1;
 	ret = generate(fp);
 	if (ret)
 		goto out;
+
+	do_update = force || files_differ(fp, path);
 	fp = NULL;
 	if (fclose(to_close))
 		goto out;
-	if (adjust_shared_perm(tmp) < 0)
-		goto out;
-	if (rename(tmp, path) < 0)
-		goto out;
+	if (do_update) {
+		if (adjust_shared_perm(tmp) < 0)
+			goto out;
+		if (rename(tmp, path) < 0)
+			goto out;
+	} else {
+		unlink(tmp);
+	}
 	ret = 0;
 
 out:
@@ -78,10 +119,10 @@ static int generate_info_refs(FILE *fp)
 	return for_each_ref(add_info_ref, fp);
 }
 
-static int update_info_refs(void)
+static int update_info_refs(int force)
 {
 	char *path = git_pathdup("info/refs");
-	int ret = update_info_file(path, generate_info_refs);
+	int ret = update_info_file(path, generate_info_refs, force);
 	free(path);
 	return ret;
 }
@@ -254,7 +295,7 @@ static int update_info_packs(int force)
 	int ret;
 
 	init_pack_info(infofile, force);
-	ret = update_info_file(infofile, write_pack_info_file);
+	ret = update_info_file(infofile, write_pack_info_file, force);
 	free_pack_info();
 	free(infofile);
 	return ret;
@@ -269,7 +310,7 @@ int update_server_info(int force)
 	 */
 	int errs = 0;
 
-	errs = errs | update_info_refs();
+	errs = errs | update_info_refs(force);
 	errs = errs | update_info_packs(force);
 
 	/* remove leftover rev-cache file if there is any */
diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh
new file mode 100755
index 0000000000..f1cec46914
--- /dev/null
+++ b/t/t5200-update-server-info.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+test_description='Test git stash show configuration.'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' 'test_commit file'
+
+test_expect_success 'create info/refs' '
+	git update-server-info &&
+	test_path_is_file .git/info/refs
+'
+
+test_expect_success 'modify and store mtime' '
+	test-tool chmtime =0 .git/info/refs &&
+	test-tool chmtime --get .git/info/refs >a
+'
+
+test_expect_success 'info/refs is not needlessly overwritten' '
+	git update-server-info &&
+	test-tool chmtime --get .git/info/refs >b &&
+	test_cmp a b
+'
+
+test_expect_success 'info/refs can be forced to update' '
+	git update-server-info -f &&
+	test-tool chmtime --get .git/info/refs >b &&
+	! test_cmp a b
+'
+
+test_expect_success 'info/refs updates when changes are made' '
+	test-tool chmtime =0 .git/info/refs &&
+	test-tool chmtime --get .git/info/refs >b &&
+	test_cmp a b &&
+	git update-ref refs/heads/foo HEAD &&
+	git update-server-info &&
+	test-tool chmtime --get .git/info/refs >b &&
+	! test_cmp a b
+'
+
+test_done
-- 
EW


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-11  1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong
@ 2019-05-11  7:35 ` Eric Sunshine
  2019-05-11 20:47   ` [PATCH v2] " Eric Wong
  2019-05-11 21:17 ` [PATCH] " Eric Wong
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 26+ messages in thread
From: Eric Sunshine @ 2019-05-11  7:35 UTC (permalink / raw)
  To: Eric Wong; +Cc: Git List

On Fri, May 10, 2019 at 9:35 PM Eric Wong <e@80x24.org> wrote:
> Do not change the existing info/refs and objects/info/packs
> files if they match the existing content on the filesystem.
> This is intended to preserve mtime and make it easier for dumb
> HTTP pollers to rely on the If-Modified-Since header.
> [...]
> Signed-off-by: Eric Wong <e@80x24.org>
> ---
> diff --git a/server-info.c b/server-info.c
> @@ -6,37 +6,78 @@
> +static int files_differ(FILE *fp, const char *path)
> +{
> +       [...]
> +       struct strbuf tmp = STRBUF_INIT;
> +       [...]
> +       if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len)
> +               return 1;

Although strbuf_fread() will release 'tmp' automatically if the read
actually fails, it won't do so otherwise. So, if it reads more or
fewer bytes than expected, for some reason, then this early return
will leak the strbuf, won't it?

> +       the_hash_algo->init_fn(&c);
> +       the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
> +       the_hash_algo->final_fn(oid_new.hash, &c);
> +       strbuf_release(&tmp);
> +
> +       if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0)
> +               return 1;

Same issue.

> diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh
> @@ -0,0 +1,41 @@
> +test_description='Test git stash show configuration.'

Copy/paste error for test description?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2] update-server-info: avoid needless overwrites
  2019-05-11  7:35 ` Eric Sunshine
@ 2019-05-11 20:47   ` Eric Wong
  0 siblings, 0 replies; 26+ messages in thread
From: Eric Wong @ 2019-05-11 20:47 UTC (permalink / raw)
  To: Eric Sunshine; +Cc: git

Eric Sunshine <sunshine@sunshineco.com> wrote:
> On Fri, May 10, 2019 at 9:35 PM Eric Wong <e@80x24.org> wrote:
> > +       if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len)
> > +               return 1;
> 
> Although strbuf_fread() will release 'tmp' automatically if the read
> actually fails, it won't do so otherwise. So, if it reads more or
> fewer bytes than expected, for some reason, then this early return
> will leak the strbuf, won't it?

> > +       if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0)
> > +               return 1;
> 
> Same issue.
> 
> > diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh
> > @@ -0,0 +1,41 @@
> > +test_description='Test git stash show configuration.'
> 
> Copy/paste error for test description?

Yup, thanks for the review.  Updated patch below

-----8<------
Subject: [PATCH] update-server-info: avoid needless overwrites

Do not change the existing info/refs and objects/info/packs
files if they match the existing content on the filesystem.
This is intended to preserve mtime and make it easier for dumb
HTTP pollers to rely on the If-Modified-Since header.

Combined with stdio and kernel buffering; the kernel should be
able to avoid block layer writes and reduce wear.

Signed-off-by: Eric Wong <e@80x24.org>
---
Interdiff:
  diff --git a/server-info.c b/server-info.c
  index 34599e4817..11515804d4 100644
  --- a/server-info.c
  +++ b/server-info.c
  @@ -24,15 +24,19 @@ static int files_differ(FILE *fp, const char *path)
   		return 1;
   
   	rewind(fp);
  -	if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len)
  +	if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) {
  +		strbuf_release(&tmp);
   		return 1;
  +	}
   	the_hash_algo->init_fn(&c);
   	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
   	the_hash_algo->final_fn(oid_new.hash, &c);
   	strbuf_release(&tmp);
   
  -	if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0)
  +	if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) {
  +		strbuf_release(&tmp);
   		return 1;
  +	}
   	the_hash_algo->init_fn(&c);
   	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
   	the_hash_algo->final_fn(oid_old.hash, &c);
  diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh
  index f1cec46914..21a58eecb9 100755
  --- a/t/t5200-update-server-info.sh
  +++ b/t/t5200-update-server-info.sh
  @@ -1,6 +1,6 @@
   #!/bin/sh
   
  -test_description='Test git stash show configuration.'
  +test_description='Test git update-server-info'
   
   . ./test-lib.sh
   

 server-info.c                 | 65 +++++++++++++++++++++++++++++------
 t/t5200-update-server-info.sh | 41 ++++++++++++++++++++++
 2 files changed, 96 insertions(+), 10 deletions(-)
 create mode 100755 t/t5200-update-server-info.sh

diff --git a/server-info.c b/server-info.c
index 41274d098b..11515804d4 100644
--- a/server-info.c
+++ b/server-info.c
@@ -6,37 +6,82 @@
 #include "tag.h"
 #include "packfile.h"
 #include "object-store.h"
+#include "strbuf.h"
+
+static int files_differ(FILE *fp, const char *path)
+{
+	struct stat st;
+	git_hash_ctx c;
+	struct object_id oid_old, oid_new;
+	struct strbuf tmp = STRBUF_INIT;
+	long new_len = ftell(fp);
+
+	if (new_len < 0 || stat(path, &st) < 0)
+		return 1;
+	if (!S_ISREG(st.st_mode))
+		return 1;
+	if ((off_t)new_len != st.st_size)
+		return 1;
+
+	rewind(fp);
+	if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) {
+		strbuf_release(&tmp);
+		return 1;
+	}
+	the_hash_algo->init_fn(&c);
+	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
+	the_hash_algo->final_fn(oid_new.hash, &c);
+	strbuf_release(&tmp);
+
+	if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) {
+		strbuf_release(&tmp);
+		return 1;
+	}
+	the_hash_algo->init_fn(&c);
+	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
+	the_hash_algo->final_fn(oid_old.hash, &c);
+	strbuf_release(&tmp);
+
+	return hashcmp(oid_old.hash, oid_new.hash);
+}
 
 /*
  * Create the file "path" by writing to a temporary file and renaming
  * it into place. The contents of the file come from "generate", which
  * should return non-zero if it encounters an error.
  */
-static int update_info_file(char *path, int (*generate)(FILE *))
+static int update_info_file(char *path, int (*generate)(FILE *), int force)
 {
 	char *tmp = mkpathdup("%s_XXXXXX", path);
 	int ret = -1;
 	int fd = -1;
 	FILE *fp = NULL, *to_close;
+	int do_update;
 
 	safe_create_leading_directories(path);
 	fd = git_mkstemp_mode(tmp, 0666);
 	if (fd < 0)
 		goto out;
-	to_close = fp = fdopen(fd, "w");
+	to_close = fp = fdopen(fd, "w+");
 	if (!fp)
 		goto out;
 	fd = -1;
 	ret = generate(fp);
 	if (ret)
 		goto out;
+
+	do_update = force || files_differ(fp, path);
 	fp = NULL;
 	if (fclose(to_close))
 		goto out;
-	if (adjust_shared_perm(tmp) < 0)
-		goto out;
-	if (rename(tmp, path) < 0)
-		goto out;
+	if (do_update) {
+		if (adjust_shared_perm(tmp) < 0)
+			goto out;
+		if (rename(tmp, path) < 0)
+			goto out;
+	} else {
+		unlink(tmp);
+	}
 	ret = 0;
 
 out:
@@ -78,10 +123,10 @@ static int generate_info_refs(FILE *fp)
 	return for_each_ref(add_info_ref, fp);
 }
 
-static int update_info_refs(void)
+static int update_info_refs(int force)
 {
 	char *path = git_pathdup("info/refs");
-	int ret = update_info_file(path, generate_info_refs);
+	int ret = update_info_file(path, generate_info_refs, force);
 	free(path);
 	return ret;
 }
@@ -254,7 +299,7 @@ static int update_info_packs(int force)
 	int ret;
 
 	init_pack_info(infofile, force);
-	ret = update_info_file(infofile, write_pack_info_file);
+	ret = update_info_file(infofile, write_pack_info_file, force);
 	free_pack_info();
 	free(infofile);
 	return ret;
@@ -269,7 +314,7 @@ int update_server_info(int force)
 	 */
 	int errs = 0;
 
-	errs = errs | update_info_refs();
+	errs = errs | update_info_refs(force);
 	errs = errs | update_info_packs(force);
 
 	/* remove leftover rev-cache file if there is any */
diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh
new file mode 100755
index 0000000000..21a58eecb9
--- /dev/null
+++ b/t/t5200-update-server-info.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+test_description='Test git update-server-info'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' 'test_commit file'
+
+test_expect_success 'create info/refs' '
+	git update-server-info &&
+	test_path_is_file .git/info/refs
+'
+
+test_expect_success 'modify and store mtime' '
+	test-tool chmtime =0 .git/info/refs &&
+	test-tool chmtime --get .git/info/refs >a
+'
+
+test_expect_success 'info/refs is not needlessly overwritten' '
+	git update-server-info &&
+	test-tool chmtime --get .git/info/refs >b &&
+	test_cmp a b
+'
+
+test_expect_success 'info/refs can be forced to update' '
+	git update-server-info -f &&
+	test-tool chmtime --get .git/info/refs >b &&
+	! test_cmp a b
+'
+
+test_expect_success 'info/refs updates when changes are made' '
+	test-tool chmtime =0 .git/info/refs &&
+	test-tool chmtime --get .git/info/refs >b &&
+	test_cmp a b &&
+	git update-ref refs/heads/foo HEAD &&
+	git update-server-info &&
+	test-tool chmtime --get .git/info/refs >b &&
+	! test_cmp a b
+'
+
+test_done
-- 
EW

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-11  1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong
  2019-05-11  7:35 ` Eric Sunshine
@ 2019-05-11 21:17 ` Eric Wong
  2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason
  2019-05-13 23:17 ` [PATCH v3] update-server-info: avoid needless overwrites Eric Wong
  3 siblings, 0 replies; 26+ messages in thread
From: Eric Wong @ 2019-05-11 21:17 UTC (permalink / raw)
  To: git

Eric Wong <e@80x24.org> wrote:
> Combined with stdio and kernel buffering; the kernel should be
> able to avoid block layer writes and reduce wear.

Along the same lines, I'd like to get repack to stop recreating
identical packs at some point (but that could be months from now...)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-11  1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong
  2019-05-11  7:35 ` Eric Sunshine
  2019-05-11 21:17 ` [PATCH] " Eric Wong
@ 2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason
  2019-05-12  0:38   ` Eric Wong
  2019-05-12  4:08   ` Jeff King
  2019-05-13 23:17 ` [PATCH v3] update-server-info: avoid needless overwrites Eric Wong
  3 siblings, 2 replies; 26+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-05-11 23:37 UTC (permalink / raw)
  To: Eric Wong; +Cc: git, Jeff King, Junio C Hamano

On Sat, May 11 2019, Eric Wong wrote:

> Do not change the existing info/refs and objects/info/packs
> files if they match the existing content on the filesystem.
> This is intended to preserve mtime and make it easier for dumb
> HTTP pollers to rely on the If-Modified-Since header.
>
> Combined with stdio and kernel buffering; the kernel should be
> able to avoid block layer writes and reduce wear.
>
> Signed-off-by: Eric Wong <e@80x24.org>
> ---
>  server-info.c                 | 61 +++++++++++++++++++++++++++++------
>  t/t5200-update-server-info.sh | 41 +++++++++++++++++++++++
>  2 files changed, 92 insertions(+), 10 deletions(-)
>  create mode 100755 t/t5200-update-server-info.sh
>
> diff --git a/server-info.c b/server-info.c
> index 41274d098b..34599e4817 100644
> --- a/server-info.c
> +++ b/server-info.c
> @@ -6,37 +6,78 @@
>  #include "tag.h"
>  #include "packfile.h"
>  #include "object-store.h"
> +#include "strbuf.h"
> +
> +static int files_differ(FILE *fp, const char *path)
> +{
> +	struct stat st;
> +	git_hash_ctx c;
> +	struct object_id oid_old, oid_new;
> +	struct strbuf tmp = STRBUF_INIT;
> +	long new_len = ftell(fp);
> +
> +	if (new_len < 0 || stat(path, &st) < 0)
> +		return 1;
> +	if (!S_ISREG(st.st_mode))
> +		return 1;
> +	if ((off_t)new_len != st.st_size)
> +		return 1;
> +
> +	rewind(fp);
> +	if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len)
> +		return 1;
> +	the_hash_algo->init_fn(&c);
> +	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
> +	the_hash_algo->final_fn(oid_new.hash, &c);
> +	strbuf_release(&tmp);
> +
> +	if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0)
> +		return 1;
> +	the_hash_algo->init_fn(&c);
> +	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
> +	the_hash_algo->final_fn(oid_old.hash, &c);
> +	strbuf_release(&tmp);
> +
> +	return hashcmp(oid_old.hash, oid_new.hash);
> +}

This way of doing it just seems so weirdly convoluted. Read them one at
a time, compute the SHA-1, just to see if they're different. Why not
something closer to a plain memcmp():

	static int files_differ(FILE *fp, const char *path)
	{
		struct strbuf old = STRBUF_INIT, new = STRBUF_INIT;
		long new_len = ftell(fp);
		int diff = 1;

		rewind(fp);
		if (strbuf_fread(&new, (size_t)new_len, fp) != (size_t)new_len)
			goto release_return;
		if (strbuf_read_file(&old, path, 0) < 0)
			goto release_return;

		diff = strbuf_cmp(&old, &new);

	release_return:
		strbuf_release(&old);
		strbuf_release(&new);

		return diff;
	}

I.e. optimze for code simplicity with something close to a dumb "cmp
--silent". I'm going to make the bold claim that nobody using "dumb
http" is going to be impacted by the performance of reading their
for-each-ref or for-each-pack dump, hence opting for not even bothing to
stat() to get the size before reading.

Because really, if we were *trying* to micro-optimize this for time or
memory use there's much better ways, e.g. reading the old file and
memcmp() as we go and stream the "generate" callback, but I just don't
see the point of trying in this case.

>  /*
>   * Create the file "path" by writing to a temporary file and renaming
>   * it into place. The contents of the file come from "generate", which
>   * should return non-zero if it encounters an error.
>   */
> -static int update_info_file(char *path, int (*generate)(FILE *))
> +static int update_info_file(char *path, int (*generate)(FILE *), int force)
>  {
>  	char *tmp = mkpathdup("%s_XXXXXX", path);

Unrelated to this, patch, but I hadn't thought about this nasty race
condition. We recommend users run this from the "post-update" (or
"post-receive") hook, and don't juggle the lock along with the ref
update, thus due to the vagaries of scheduling you can end up with two
concurrent ref updates where the "old" one wins.

But I guess that brings me back to something close to "nobody with that
sort of update rate is using 'dumb http'" :)

For this to work properly we'd need to take some sort of global "ref
update/pack update" lock, and I guess at that point this "cmp" case
would be a helper similar to commit_lock_file_to(),
i.e. a commit_lock_file_to_if_different().

>  	int ret = -1;
>  	int fd = -1;
>  	FILE *fp = NULL, *to_close;
> +	int do_update;
>
>  	safe_create_leading_directories(path);
>  	fd = git_mkstemp_mode(tmp, 0666);
>  	if (fd < 0)
>  		goto out;
> -	to_close = fp = fdopen(fd, "w");
> +	to_close = fp = fdopen(fd, "w+");
>  	if (!fp)
>  		goto out;
>  	fd = -1;
>  	ret = generate(fp);
>  	if (ret)
>  		goto out;
> +
> +	do_update = force || files_differ(fp, path);
> [...]
>
> -static int update_info_refs(void)
> +static int update_info_refs(int force)

So, I was going to say "shouldn't we update the docs?" which for --force
say "Update the info files from scratch.".

But reading through it that "from scratch" wording is from c743e6e3c0
("Add a link from update-server-info documentation to repository
layout.", 2005-09-04).

There it was a refrence to a bug since fixed in 60d0526aaa ("Unoptimize
info/refs creation.", 2005-09-14), and then removed from the docs in
c5fe5b6de9 ("Remove obsolete bug warning in man git-update-server-info",
2009-04-25).

Then in b3223761c8 ("update_info_refs(): drop unused force parameter",
2019-04-05) Jeff removed the unused-for-a-decade "force" param.

So having gone through the history I think we're better off just
dropping the --force argument entirely, or at least changing the
docs.

Before this change the only thing it was doing was pruning stuff we
haven't written since 2005-ish (see 3e15c67c90 ("server-info: throw away
T computation as well.", 2005-12-04)), rather than "detect if useless"
we should just write out the file again, and then skip if changed
(i.e. this logic).

That would also take care of the parse_pack_def() case by proxy, i.e. we
don't need some "is stale if pack is missing" special case if we just
always write out a new file (or not if it memcmp's the same as the "old"
one), we want to change the file if the packs are updated anyway

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason
@ 2019-05-12  0:38   ` Eric Wong
  2019-05-12  4:08   ` Jeff King
  1 sibling, 0 replies; 26+ messages in thread
From: Eric Wong @ 2019-05-12  0:38 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: git, Jeff King, Junio C Hamano

Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
> On Sat, May 11 2019, Eric Wong wrote:
> > +static int files_differ(FILE *fp, const char *path)
> > +{
> > +	struct stat st;
> > +	git_hash_ctx c;
> > +	struct object_id oid_old, oid_new;
> > +	struct strbuf tmp = STRBUF_INIT;
> > +	long new_len = ftell(fp);
> > +
> > +	if (new_len < 0 || stat(path, &st) < 0)
> > +		return 1;
> > +	if (!S_ISREG(st.st_mode))
> > +		return 1;
> > +	if ((off_t)new_len != st.st_size)
> > +		return 1;
> > +
> > +	rewind(fp);
> > +	if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len)
> > +		return 1;
> > +	the_hash_algo->init_fn(&c);
> > +	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
> > +	the_hash_algo->final_fn(oid_new.hash, &c);
> > +	strbuf_release(&tmp);
> > +
> > +	if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0)
> > +		return 1;
> > +	the_hash_algo->init_fn(&c);
> > +	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
> > +	the_hash_algo->final_fn(oid_old.hash, &c);
> > +	strbuf_release(&tmp);
> > +
> > +	return hashcmp(oid_old.hash, oid_new.hash);
> > +}
> 
> This way of doing it just seems so weirdly convoluted. Read them one at
> a time, compute the SHA-1, just to see if they're different. Why not
> something closer to a plain memcmp():
> 
> 	static int files_differ(FILE *fp, const char *path)
> 	{
> 		struct strbuf old = STRBUF_INIT, new = STRBUF_INIT;
> 		long new_len = ftell(fp);
> 		int diff = 1;
> 
> 		rewind(fp);
> 		if (strbuf_fread(&new, (size_t)new_len, fp) != (size_t)new_len)
> 			goto release_return;
> 		if (strbuf_read_file(&old, path, 0) < 0)
> 			goto release_return;
> 
> 		diff = strbuf_cmp(&old, &new);
> 
> 	release_return:
> 		strbuf_release(&old);
> 		strbuf_release(&new);
> 
> 		return diff;
> 	}
> 
> I.e. optimze for code simplicity with something close to a dumb "cmp
> --silent". I'm going to make the bold claim that nobody using "dumb
> http" is going to be impacted by the performance of reading their
> for-each-ref or for-each-pack dump, hence opting for not even bothing to
> stat() to get the size before reading.

I've been trying to improve dumb HTTP for more cases; actually.
(since it's much cheaper than smart HTTP in server memory/CPU)

> Because really, if we were *trying* to micro-optimize this for time or
> memory use there's much better ways, e.g. reading the old file and
> memcmp() as we go and stream the "generate" callback, but I just don't
> see the point of trying in this case.

I was actually going towards that route; but wasn't sure if this
idea would be accepted at all (and I've been trying to stay away
from using non-scripting languages).

I don't slurping all of info/refs into memory at all; so maybe
a streaming memcmp of the existing file is worth doing...

> >  /*
> >   * Create the file "path" by writing to a temporary file and renaming
> >   * it into place. The contents of the file come from "generate", which
> >   * should return non-zero if it encounters an error.
> >   */
> > -static int update_info_file(char *path, int (*generate)(FILE *))
> > +static int update_info_file(char *path, int (*generate)(FILE *), int force)
> >  {
> >  	char *tmp = mkpathdup("%s_XXXXXX", path);
> 
> Unrelated to this, patch, but I hadn't thought about this nasty race
> condition. We recommend users run this from the "post-update" (or
> "post-receive") hook, and don't juggle the lock along with the ref
> update, thus due to the vagaries of scheduling you can end up with two
> concurrent ref updates where the "old" one wins.
> 
> But I guess that brings me back to something close to "nobody with that
> sort of update rate is using 'dumb http'" :)
> 
> For this to work properly we'd need to take some sort of global "ref
> update/pack update" lock, and I guess at that point this "cmp" case
> would be a helper similar to commit_lock_file_to(),
> i.e. a commit_lock_file_to_if_different().

Worth a separate patch, at some point, I think.  I'm not too
familiar with the existing locking in git, actually...

Along those lines, I think repack/gc should automatically
update objects/info/packs if the file already exists.

> >  	int ret = -1;
> >  	int fd = -1;
> >  	FILE *fp = NULL, *to_close;
> > +	int do_update;
> >
> >  	safe_create_leading_directories(path);
> >  	fd = git_mkstemp_mode(tmp, 0666);
> >  	if (fd < 0)
> >  		goto out;
> > -	to_close = fp = fdopen(fd, "w");
> > +	to_close = fp = fdopen(fd, "w+");
> >  	if (!fp)
> >  		goto out;
> >  	fd = -1;
> >  	ret = generate(fp);
> >  	if (ret)
> >  		goto out;
> > +
> > +	do_update = force || files_differ(fp, path);
> > [...]
> >
> > -static int update_info_refs(void)
> > +static int update_info_refs(int force)
> 
> So, I was going to say "shouldn't we update the docs?" which for --force
> say "Update the info files from scratch.".
> 
> But reading through it that "from scratch" wording is from c743e6e3c0
> ("Add a link from update-server-info documentation to repository
> layout.", 2005-09-04).

Yes, that wording is awkward and I can update it.  But maybe making
it undocumented is sufficient and would save us the trouble
of describing it :)

"--force" might be seen as a performance optimization for cases
where you're certain the result will differ, but I'm not
sure if that's worth mentioning in the manpage.

> There it was a refrence to a bug since fixed in 60d0526aaa ("Unoptimize
> info/refs creation.", 2005-09-14), and then removed from the docs in
> c5fe5b6de9 ("Remove obsolete bug warning in man git-update-server-info",
> 2009-04-25).
> 
> Then in b3223761c8 ("update_info_refs(): drop unused force parameter",
> 2019-04-05) Jeff removed the unused-for-a-decade "force" param.
> 
> So having gone through the history I think we're better off just
> dropping the --force argument entirely, or at least changing the
> docs.

I can update the docs, or make it undocumented.  Compatibility
from the command-line needs to remain in case there are scripts
using it.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason
  2019-05-12  0:38   ` Eric Wong
@ 2019-05-12  4:08   ` Jeff King
  2019-05-12  7:16     ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 26+ messages in thread
From: Jeff King @ 2019-05-12  4:08 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano

On Sun, May 12, 2019 at 01:37:55AM +0200, Ævar Arnfjörð Bjarmason wrote:

> This way of doing it just seems so weirdly convoluted. Read them one at
> a time, compute the SHA-1, just to see if they're different. Why not
> something closer to a plain memcmp():

FWIW, I had the exact same thought on reading the patch. Checking sizes
seems like an easy optimization and I don't mind it, but computing
hashes when we could just compare the bytes seems pointless. I'd have
expected to stream 4k blocks as we go.

> I.e. optimze for code simplicity with something close to a dumb "cmp
> --silent". I'm going to make the bold claim that nobody using "dumb
> http" is going to be impacted by the performance of reading their
> for-each-ref or for-each-pack dump, hence opting for not even bothing to
> stat() to get the size before reading.

You're probably right (especially because we'd just spent O(n) work
generating the candidate file). But note that I have seen pathological
cases where info/refs was gigabytes.

> >  /*
> >   * Create the file "path" by writing to a temporary file and renaming
> >   * it into place. The contents of the file come from "generate", which
> >   * should return non-zero if it encounters an error.
> >   */
> > -static int update_info_file(char *path, int (*generate)(FILE *))
> > +static int update_info_file(char *path, int (*generate)(FILE *), int force)
> >  {
> >  	char *tmp = mkpathdup("%s_XXXXXX", path);
> 
> Unrelated to this, patch, but I hadn't thought about this nasty race
> condition. We recommend users run this from the "post-update" (or
> "post-receive") hook, and don't juggle the lock along with the ref
> update, thus due to the vagaries of scheduling you can end up with two
> concurrent ref updates where the "old" one wins.
> 
> But I guess that brings me back to something close to "nobody with that
> sort of update rate is using 'dumb http'" :)
> 
> For this to work properly we'd need to take some sort of global "ref
> update/pack update" lock, and I guess at that point this "cmp" case
> would be a helper similar to commit_lock_file_to(),
> i.e. a commit_lock_file_to_if_different().

I don't think our usual dot-locking is great here. What does the
race-loser do when it can't take the lock? It can't just skip its
update, since it needs to make sure that its new pack is mentioned.

So it has to wait and poll until the other process finishes. I guess
maybe that isn't the end of the world.

> Then in b3223761c8 ("update_info_refs(): drop unused force parameter",
> 2019-04-05) Jeff removed the unused-for-a-decade "force" param.
> 
> So having gone through the history I think we're better off just
> dropping the --force argument entirely, or at least changing the
> docs.
> 
> Before this change the only thing it was doing was pruning stuff we
> haven't written since 2005-ish (see 3e15c67c90 ("server-info: throw away
> T computation as well.", 2005-12-04)), rather than "detect if useless"
> we should just write out the file again, and then skip if changed
> (i.e. this logic).

Yeah, my commit only ripped out the useless force parameter for
info/refs. For info/packs, there's still that weird "is is stale"
computation (which I fixed several bugs in). It's not entirely clear to
me if that can just go away, but I agree that if we can it's simpler and
more desirable to just generate the candidate result and see if it's
bit-for-bit identical or not.

I'm not entirely sure of all of the magic that "stale" check is trying
to accomplish. I think there's some bits in there that try to preserve
the existing ordering, but I don't know why anyone would care (and there
are so many cases where the ordering gets thrown out that I think
anybody who does care is likely to get disappointed).

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-12  4:08   ` Jeff King
@ 2019-05-12  7:16     ` Ævar Arnfjörð Bjarmason
  2019-05-14  9:47       ` Jeff King
  0 siblings, 1 reply; 26+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-05-12  7:16 UTC (permalink / raw)
  To: Jeff King; +Cc: Eric Wong, git, Junio C Hamano


On Sun, May 12 2019, Jeff King wrote:

> On Sun, May 12, 2019 at 01:37:55AM +0200, Ævar Arnfjörð Bjarmason wrote:
>
>> This way of doing it just seems so weirdly convoluted. Read them one at
>> a time, compute the SHA-1, just to see if they're different. Why not
>> something closer to a plain memcmp():
>
> FWIW, I had the exact same thought on reading the patch. Checking sizes
> seems like an easy optimization and I don't mind it, but computing
> hashes when we could just compare the bytes seems pointless. I'd have
> expected to stream 4k blocks as we go.
>
>> I.e. optimze for code simplicity with something close to a dumb "cmp
>> --silent". I'm going to make the bold claim that nobody using "dumb
>> http" is going to be impacted by the performance of reading their
>> for-each-ref or for-each-pack dump, hence opting for not even bothing to
>> stat() to get the size before reading.
>
> You're probably right (especially because we'd just spent O(n) work
> generating the candidate file). But note that I have seen pathological
> cases where info/refs was gigabytes.

Wouldn't those users be calling update-server-info from something like a
post-receive hook where they *know* the refs/packs just got updated?

Well, there is "transfer.unpackLimit" to contend with, but that's just
for "packs are same", not "refs are same", and that file's presumably
much smaller.

So I'd think that *if* this is a case where there's even a question of
the performance mattering here, and I'd assume "refs changed but size is
the same" is a common case (e.g. craploads or tags, but just "master"
pushed-to) we could make this an opt-in "--update-if-changed", because
the "if changed" is extra work, and we're already documenting that you
should be calling this on the "I know it changed" hook path.

I don't care even if we use the initial SHA-1 method. It just struck me
as odd to conflate the reasonable "don't update mtime for optimizing
dumb over-the-wire client" with "optimize resource use when updating".

>> >  /*
>> >   * Create the file "path" by writing to a temporary file and renaming
>> >   * it into place. The contents of the file come from "generate", which
>> >   * should return non-zero if it encounters an error.
>> >   */
>> > -static int update_info_file(char *path, int (*generate)(FILE *))
>> > +static int update_info_file(char *path, int (*generate)(FILE *), int force)
>> >  {
>> >  	char *tmp = mkpathdup("%s_XXXXXX", path);
>>
>> Unrelated to this, patch, but I hadn't thought about this nasty race
>> condition. We recommend users run this from the "post-update" (or
>> "post-receive") hook, and don't juggle the lock along with the ref
>> update, thus due to the vagaries of scheduling you can end up with two
>> concurrent ref updates where the "old" one wins.
>>
>> But I guess that brings me back to something close to "nobody with that
>> sort of update rate is using 'dumb http'" :)
>>
>> For this to work properly we'd need to take some sort of global "ref
>> update/pack update" lock, and I guess at that point this "cmp" case
>> would be a helper similar to commit_lock_file_to(),
>> i.e. a commit_lock_file_to_if_different().
>
> I don't think our usual dot-locking is great here. What does the
> race-loser do when it can't take the lock? It can't just skip its
> update, since it needs to make sure that its new pack is mentioned.

Right, I mean a *global* .git/I_AM_DOING_WRITES.lock, because there's no
way to square the ref backend's notion of per-ref ref lock enabling
concurrent pushes with update-server-info's desire to generate metadata
showing *all* the refs.

> So it has to wait and poll until the other process finishes. I guess
> maybe that isn't the end of the world.

If "its" is update-server-info this won't work. It's possible for two
update-server-info processes to be racing in such a way that their
for_each_ref() reads a ref at a given value that'll be updated 1
millisecond later, but to then have that update's update-server-info
"win" the race to update the info files (hypothetical locks on those
info files and all).

Thus the "info" files will be updated to old values, since we'll see we
need changes, but change things to the old values.

All things that *can* be dealt with in some clever ways, but I think
just further proof nobody's using this for anything serious :)

I.e. the "commit A happened before B" but also "commit B's post-* hooks
finished after A" is a thing that happens quite frequently (per my
logs).

>> Then in b3223761c8 ("update_info_refs(): drop unused force parameter",
>> 2019-04-05) Jeff removed the unused-for-a-decade "force" param.
>>
>> So having gone through the history I think we're better off just
>> dropping the --force argument entirely, or at least changing the
>> docs.
>>
>> Before this change the only thing it was doing was pruning stuff we
>> haven't written since 2005-ish (see 3e15c67c90 ("server-info: throw away
>> T computation as well.", 2005-12-04)), rather than "detect if useless"
>> we should just write out the file again, and then skip if changed
>> (i.e. this logic).
>
> Yeah, my commit only ripped out the useless force parameter for
> info/refs. For info/packs, there's still that weird "is is stale"
> computation (which I fixed several bugs in). It's not entirely clear to
> me if that can just go away, but I agree that if we can it's simpler and
> more desirable to just generate the candidate result and see if it's
> bit-for-bit identical or not.
>
> I'm not entirely sure of all of the magic that "stale" check is trying
> to accomplish. I think there's some bits in there that try to preserve
> the existing ordering, but I don't know why anyone would care (and there
> are so many cases where the ordering gets thrown out that I think
> anybody who does care is likely to get disappointed).

My reading of it is that it's premature optimization that can go away
(and most of it has already), for "it's cheap" and "if not it's called
from the 'I know I had an update'" hook case, as noted above.<

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3] update-server-info: avoid needless overwrites
  2019-05-11  1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong
                   ` (2 preceding siblings ...)
  2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason
@ 2019-05-13 23:17 ` Eric Wong
  3 siblings, 0 replies; 26+ messages in thread
From: Eric Wong @ 2019-05-13 23:17 UTC (permalink / raw)
  To: git; +Cc: Eric Sunshine, Ævar Arnfjörð Bjarmason, Jeff King

Do not change the existing info/refs and objects/info/packs
files if they match the existing content on the filesystem.
This is intended to preserve mtime and make it easier for dumb
HTTP pollers to rely on the If-Modified-Since header.

Combined with stdio and kernel buffering; the kernel should be
able to avoid block layer writes and reduce wear for small files.

As a result, the --force option is no longer needed.  So stop
documenting it, but let it remain for compatibility (and
debugging, if necessary).

v3: perform incremental comparison while generating to avoid
    OOM with giant files.  Remove documentation for --force.

Signed-off-by: Eric Wong <e@80x24.org>
---
  OK, performing the incremental comparison wasn't nearly as
  bad as thought it'd be.  I might be suffering from
  compiler-linker-slowness PTSD :x

Interdiff:
  diff --git a/Documentation/git-update-server-info.txt b/Documentation/git-update-server-info.txt
  index bd0e36492f..969bb2e15f 100644
  --- a/Documentation/git-update-server-info.txt
  +++ b/Documentation/git-update-server-info.txt
  @@ -9,7 +9,7 @@ git-update-server-info - Update auxiliary info file to help dumb servers
   SYNOPSIS
   --------
   [verse]
  -'git update-server-info' [--force]
  +'git update-server-info'
   
   DESCRIPTION
   -----------
  @@ -19,15 +19,6 @@ $GIT_OBJECT_DIRECTORY/info directories to help clients discover
   what references and packs the server has.  This command
   generates such auxiliary files.
   
  -
  -OPTIONS
  --------
  -
  --f::
  ---force::
  -	Update the info files from scratch.
  -
  -
   OUTPUT
   ------
   
  diff --git a/server-info.c b/server-info.c
  index 11515804d4..e68f785c2f 100644
  --- a/server-info.c
  +++ b/server-info.c
  @@ -8,41 +8,54 @@
   #include "object-store.h"
   #include "strbuf.h"
   
  -static int files_differ(FILE *fp, const char *path)
  +struct update_info_ctx {
  +	FILE *cur_fp;
  +	FILE *old_fp; /* becomes NULL if it differs from cur_fp */
  +	struct strbuf cur_sb;
  +	struct strbuf old_sb;
  +};
  +
  +static void uic_mark_stale(struct update_info_ctx *uic)
   {
  -	struct stat st;
  -	git_hash_ctx c;
  -	struct object_id oid_old, oid_new;
  -	struct strbuf tmp = STRBUF_INIT;
  -	long new_len = ftell(fp);
  +	fclose(uic->old_fp);
  +	uic->old_fp = NULL;
  +}
   
  -	if (new_len < 0 || stat(path, &st) < 0)
  -		return 1;
  -	if (!S_ISREG(st.st_mode))
  -		return 1;
  -	if ((off_t)new_len != st.st_size)
  -		return 1;
  +static int uic_is_stale(const struct update_info_ctx *uic)
  +{
  +	return uic->old_fp == NULL;
  +}
   
  -	rewind(fp);
  -	if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) {
  -		strbuf_release(&tmp);
  -		return 1;
  -	}
  -	the_hash_algo->init_fn(&c);
  -	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
  -	the_hash_algo->final_fn(oid_new.hash, &c);
  -	strbuf_release(&tmp);
  +static int uic_printf(struct update_info_ctx *uic, const char *fmt, ...)
  +{
  +	va_list ap;
  +	int ret = -1;
   
  -	if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) {
  -		strbuf_release(&tmp);
  -		return 1;
  +	va_start(ap, fmt);
  +
  +	if (uic_is_stale(uic)) {
  +		ret = vfprintf(uic->cur_fp, fmt, ap);
  +	} else {
  +		ssize_t r;
  +		struct strbuf *cur = &uic->cur_sb;
  +		struct strbuf *old = &uic->old_sb;
  +
  +		strbuf_reset(cur);
  +		strbuf_vinsertf(cur, 0, fmt, ap);
  +
  +		strbuf_reset(old);
  +		strbuf_grow(old, cur->len);
  +		r = fread(old->buf, 1, cur->len, uic->old_fp);
  +		if (r != cur->len || memcmp(old->buf, cur->buf, r))
  +			uic_mark_stale(uic);
  +
  +		if (fwrite(cur->buf, 1, cur->len, uic->cur_fp) == cur->len)
  +			ret = 0;
   	}
  -	the_hash_algo->init_fn(&c);
  -	the_hash_algo->update_fn(&c, tmp.buf, tmp.len);
  -	the_hash_algo->final_fn(oid_old.hash, &c);
  -	strbuf_release(&tmp);
   
  -	return hashcmp(oid_old.hash, oid_new.hash);
  +	va_end(ap);
  +
  +	return ret;
   }
   
   /*
  @@ -50,31 +63,61 @@ static int files_differ(FILE *fp, const char *path)
    * it into place. The contents of the file come from "generate", which
    * should return non-zero if it encounters an error.
    */
  -static int update_info_file(char *path, int (*generate)(FILE *), int force)
  +static int update_info_file(char *path,
  +			int (*generate)(struct update_info_ctx *),
  +			int force)
   {
   	char *tmp = mkpathdup("%s_XXXXXX", path);
   	int ret = -1;
   	int fd = -1;
  -	FILE *fp = NULL, *to_close;
  -	int do_update;
  +	FILE *to_close;
  +	struct update_info_ctx uic = {
  +		.cur_fp = NULL,
  +		.old_fp = NULL,
  +		.cur_sb = STRBUF_INIT,
  +		.old_sb = STRBUF_INIT
  +	};
   
   	safe_create_leading_directories(path);
   	fd = git_mkstemp_mode(tmp, 0666);
   	if (fd < 0)
   		goto out;
  -	to_close = fp = fdopen(fd, "w+");
  -	if (!fp)
  +	to_close = uic.cur_fp = fdopen(fd, "w");
  +	if (!uic.cur_fp)
   		goto out;
   	fd = -1;
  -	ret = generate(fp);
  +
  +	/* no problem on ENOENT and old_fp == NULL, it's stale, now */
  +	if (!force)
  +		uic.old_fp = fopen_or_warn(path, "r");
  +
  +	/*
  +	 * uic_printf will compare incremental comparison aginst old_fp
  +	 * and mark uic as stale if needed
  +	 */
  +	ret = generate(&uic);
   	if (ret)
   		goto out;
   
  -	do_update = force || files_differ(fp, path);
  -	fp = NULL;
  +	/* new file may be shorter than the old one, check here */
  +	if (!uic_is_stale(&uic)) {
  +		struct stat st;
  +		long new_len = ftell(uic.cur_fp);
  +		int old_fd = fileno(uic.old_fp);
  +
  +		if (new_len < 0) {
  +			ret = -1;
  +			goto out;
  +		}
  +		if (fstat(old_fd, &st) || (st.st_size != (size_t)new_len))
  +			uic_mark_stale(&uic);
  +	}
  +
  +	uic.cur_fp = NULL;
   	if (fclose(to_close))
   		goto out;
  -	if (do_update) {
  +
  +	if (uic_is_stale(&uic)) {
   		if (adjust_shared_perm(tmp) < 0)
   			goto out;
   		if (rename(tmp, path) < 0)
  @@ -87,40 +130,44 @@ static int update_info_file(char *path, int (*generate)(FILE *), int force)
   out:
   	if (ret) {
   		error_errno("unable to update %s", path);
  -		if (fp)
  -			fclose(fp);
  +		if (uic.cur_fp)
  +			fclose(uic.cur_fp);
   		else if (fd >= 0)
   			close(fd);
   		unlink(tmp);
   	}
   	free(tmp);
  +	if (uic.old_fp)
  +		fclose(uic.old_fp);
  +	strbuf_release(&uic.old_sb);
  +	strbuf_release(&uic.cur_sb);
   	return ret;
   }
   
   static int add_info_ref(const char *path, const struct object_id *oid,
   			int flag, void *cb_data)
   {
  -	FILE *fp = cb_data;
  +	struct update_info_ctx *uic = cb_data;
   	struct object *o = parse_object(the_repository, oid);
   	if (!o)
   		return -1;
   
  -	if (fprintf(fp, "%s	%s\n", oid_to_hex(oid), path) < 0)
  +	if (uic_printf(uic, "%s	%s\n", oid_to_hex(oid), path) < 0)
   		return -1;
   
   	if (o->type == OBJ_TAG) {
   		o = deref_tag(the_repository, o, path, 0);
   		if (o)
  -			if (fprintf(fp, "%s	%s^{}\n",
  +			if (uic_printf(uic, "%s	%s^{}\n",
   				oid_to_hex(&o->oid), path) < 0)
   				return -1;
   	}
   	return 0;
   }
   
  -static int generate_info_refs(FILE *fp)
  +static int generate_info_refs(struct update_info_ctx *uic)
   {
  -	return for_each_ref(add_info_ref, fp);
  +	return for_each_ref(add_info_ref, uic);
   }
   
   static int update_info_refs(int force)
  @@ -281,14 +328,14 @@ static void free_pack_info(void)
   	free(info);
   }
   
  -static int write_pack_info_file(FILE *fp)
  +static int write_pack_info_file(struct update_info_ctx *uic)
   {
   	int i;
   	for (i = 0; i < num_pack; i++) {
  -		if (fprintf(fp, "P %s\n", pack_basename(info[i]->p)) < 0)
  +		if (uic_printf(uic, "P %s\n", pack_basename(info[i]->p)) < 0)
   			return -1;
   	}
  -	if (fputc('\n', fp) == EOF)
  +	if (uic_printf(uic, "\n") < 0)
   		return -1;
   	return 0;
   }

 Documentation/git-update-server-info.txt |  11 +-
 server-info.c                            | 140 +++++++++++++++++++----
 t/t5200-update-server-info.sh            |  41 +++++++
 3 files changed, 158 insertions(+), 34 deletions(-)
 create mode 100755 t/t5200-update-server-info.sh

diff --git a/Documentation/git-update-server-info.txt b/Documentation/git-update-server-info.txt
index bd0e36492f..969bb2e15f 100644
--- a/Documentation/git-update-server-info.txt
+++ b/Documentation/git-update-server-info.txt
@@ -9,7 +9,7 @@ git-update-server-info - Update auxiliary info file to help dumb servers
 SYNOPSIS
 --------
 [verse]
-'git update-server-info' [--force]
+'git update-server-info'
 
 DESCRIPTION
 -----------
@@ -19,15 +19,6 @@ $GIT_OBJECT_DIRECTORY/info directories to help clients discover
 what references and packs the server has.  This command
 generates such auxiliary files.
 
-
-OPTIONS
--------
-
--f::
---force::
-	Update the info files from scratch.
-
-
 OUTPUT
 ------
 
diff --git a/server-info.c b/server-info.c
index 41274d098b..e68f785c2f 100644
--- a/server-info.c
+++ b/server-info.c
@@ -6,82 +6,174 @@
 #include "tag.h"
 #include "packfile.h"
 #include "object-store.h"
+#include "strbuf.h"
+
+struct update_info_ctx {
+	FILE *cur_fp;
+	FILE *old_fp; /* becomes NULL if it differs from cur_fp */
+	struct strbuf cur_sb;
+	struct strbuf old_sb;
+};
+
+static void uic_mark_stale(struct update_info_ctx *uic)
+{
+	fclose(uic->old_fp);
+	uic->old_fp = NULL;
+}
+
+static int uic_is_stale(const struct update_info_ctx *uic)
+{
+	return uic->old_fp == NULL;
+}
+
+static int uic_printf(struct update_info_ctx *uic, const char *fmt, ...)
+{
+	va_list ap;
+	int ret = -1;
+
+	va_start(ap, fmt);
+
+	if (uic_is_stale(uic)) {
+		ret = vfprintf(uic->cur_fp, fmt, ap);
+	} else {
+		ssize_t r;
+		struct strbuf *cur = &uic->cur_sb;
+		struct strbuf *old = &uic->old_sb;
+
+		strbuf_reset(cur);
+		strbuf_vinsertf(cur, 0, fmt, ap);
+
+		strbuf_reset(old);
+		strbuf_grow(old, cur->len);
+		r = fread(old->buf, 1, cur->len, uic->old_fp);
+		if (r != cur->len || memcmp(old->buf, cur->buf, r))
+			uic_mark_stale(uic);
+
+		if (fwrite(cur->buf, 1, cur->len, uic->cur_fp) == cur->len)
+			ret = 0;
+	}
+
+	va_end(ap);
+
+	return ret;
+}
 
 /*
  * Create the file "path" by writing to a temporary file and renaming
  * it into place. The contents of the file come from "generate", which
  * should return non-zero if it encounters an error.
  */
-static int update_info_file(char *path, int (*generate)(FILE *))
+static int update_info_file(char *path,
+			int (*generate)(struct update_info_ctx *),
+			int force)
 {
 	char *tmp = mkpathdup("%s_XXXXXX", path);
 	int ret = -1;
 	int fd = -1;
-	FILE *fp = NULL, *to_close;
+	FILE *to_close;
+	struct update_info_ctx uic = {
+		.cur_fp = NULL,
+		.old_fp = NULL,
+		.cur_sb = STRBUF_INIT,
+		.old_sb = STRBUF_INIT
+	};
 
 	safe_create_leading_directories(path);
 	fd = git_mkstemp_mode(tmp, 0666);
 	if (fd < 0)
 		goto out;
-	to_close = fp = fdopen(fd, "w");
-	if (!fp)
+	to_close = uic.cur_fp = fdopen(fd, "w");
+	if (!uic.cur_fp)
 		goto out;
 	fd = -1;
-	ret = generate(fp);
+
+	/* no problem on ENOENT and old_fp == NULL, it's stale, now */
+	if (!force)
+		uic.old_fp = fopen_or_warn(path, "r");
+
+	/*
+	 * uic_printf will compare incremental comparison aginst old_fp
+	 * and mark uic as stale if needed
+	 */
+	ret = generate(&uic);
 	if (ret)
 		goto out;
-	fp = NULL;
+
+	/* new file may be shorter than the old one, check here */
+	if (!uic_is_stale(&uic)) {
+		struct stat st;
+		long new_len = ftell(uic.cur_fp);
+		int old_fd = fileno(uic.old_fp);
+
+		if (new_len < 0) {
+			ret = -1;
+			goto out;
+		}
+		if (fstat(old_fd, &st) || (st.st_size != (size_t)new_len))
+			uic_mark_stale(&uic);
+	}
+
+	uic.cur_fp = NULL;
 	if (fclose(to_close))
 		goto out;
-	if (adjust_shared_perm(tmp) < 0)
-		goto out;
-	if (rename(tmp, path) < 0)
-		goto out;
+
+	if (uic_is_stale(&uic)) {
+		if (adjust_shared_perm(tmp) < 0)
+			goto out;
+		if (rename(tmp, path) < 0)
+			goto out;
+	} else {
+		unlink(tmp);
+	}
 	ret = 0;
 
 out:
 	if (ret) {
 		error_errno("unable to update %s", path);
-		if (fp)
-			fclose(fp);
+		if (uic.cur_fp)
+			fclose(uic.cur_fp);
 		else if (fd >= 0)
 			close(fd);
 		unlink(tmp);
 	}
 	free(tmp);
+	if (uic.old_fp)
+		fclose(uic.old_fp);
+	strbuf_release(&uic.old_sb);
+	strbuf_release(&uic.cur_sb);
 	return ret;
 }
 
 static int add_info_ref(const char *path, const struct object_id *oid,
 			int flag, void *cb_data)
 {
-	FILE *fp = cb_data;
+	struct update_info_ctx *uic = cb_data;
 	struct object *o = parse_object(the_repository, oid);
 	if (!o)
 		return -1;
 
-	if (fprintf(fp, "%s	%s\n", oid_to_hex(oid), path) < 0)
+	if (uic_printf(uic, "%s	%s\n", oid_to_hex(oid), path) < 0)
 		return -1;
 
 	if (o->type == OBJ_TAG) {
 		o = deref_tag(the_repository, o, path, 0);
 		if (o)
-			if (fprintf(fp, "%s	%s^{}\n",
+			if (uic_printf(uic, "%s	%s^{}\n",
 				oid_to_hex(&o->oid), path) < 0)
 				return -1;
 	}
 	return 0;
 }
 
-static int generate_info_refs(FILE *fp)
+static int generate_info_refs(struct update_info_ctx *uic)
 {
-	return for_each_ref(add_info_ref, fp);
+	return for_each_ref(add_info_ref, uic);
 }
 
-static int update_info_refs(void)
+static int update_info_refs(int force)
 {
 	char *path = git_pathdup("info/refs");
-	int ret = update_info_file(path, generate_info_refs);
+	int ret = update_info_file(path, generate_info_refs, force);
 	free(path);
 	return ret;
 }
@@ -236,14 +328,14 @@ static void free_pack_info(void)
 	free(info);
 }
 
-static int write_pack_info_file(FILE *fp)
+static int write_pack_info_file(struct update_info_ctx *uic)
 {
 	int i;
 	for (i = 0; i < num_pack; i++) {
-		if (fprintf(fp, "P %s\n", pack_basename(info[i]->p)) < 0)
+		if (uic_printf(uic, "P %s\n", pack_basename(info[i]->p)) < 0)
 			return -1;
 	}
-	if (fputc('\n', fp) == EOF)
+	if (uic_printf(uic, "\n") < 0)
 		return -1;
 	return 0;
 }
@@ -254,7 +346,7 @@ static int update_info_packs(int force)
 	int ret;
 
 	init_pack_info(infofile, force);
-	ret = update_info_file(infofile, write_pack_info_file);
+	ret = update_info_file(infofile, write_pack_info_file, force);
 	free_pack_info();
 	free(infofile);
 	return ret;
@@ -269,7 +361,7 @@ int update_server_info(int force)
 	 */
 	int errs = 0;
 
-	errs = errs | update_info_refs();
+	errs = errs | update_info_refs(force);
 	errs = errs | update_info_packs(force);
 
 	/* remove leftover rev-cache file if there is any */
diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh
new file mode 100755
index 0000000000..21a58eecb9
--- /dev/null
+++ b/t/t5200-update-server-info.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+test_description='Test git update-server-info'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' 'test_commit file'
+
+test_expect_success 'create info/refs' '
+	git update-server-info &&
+	test_path_is_file .git/info/refs
+'
+
+test_expect_success 'modify and store mtime' '
+	test-tool chmtime =0 .git/info/refs &&
+	test-tool chmtime --get .git/info/refs >a
+'
+
+test_expect_success 'info/refs is not needlessly overwritten' '
+	git update-server-info &&
+	test-tool chmtime --get .git/info/refs >b &&
+	test_cmp a b
+'
+
+test_expect_success 'info/refs can be forced to update' '
+	git update-server-info -f &&
+	test-tool chmtime --get .git/info/refs >b &&
+	! test_cmp a b
+'
+
+test_expect_success 'info/refs updates when changes are made' '
+	test-tool chmtime =0 .git/info/refs &&
+	test-tool chmtime --get .git/info/refs >b &&
+	test_cmp a b &&
+	git update-ref refs/heads/foo HEAD &&
+	git update-server-info &&
+	test-tool chmtime --get .git/info/refs >b &&
+	! test_cmp a b
+'
+
+test_done
-- 
EW

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-12  7:16     ` Ævar Arnfjörð Bjarmason
@ 2019-05-14  9:47       ` Jeff King
  2019-05-14 10:33         ` Ævar Arnfjörð Bjarmason
  2019-05-14 11:50         ` Eric Wong
  0 siblings, 2 replies; 26+ messages in thread
From: Jeff King @ 2019-05-14  9:47 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano

On Sun, May 12, 2019 at 09:16:31AM +0200, Ævar Arnfjörð Bjarmason wrote:

> > You're probably right (especially because we'd just spent O(n) work
> > generating the candidate file). But note that I have seen pathological
> > cases where info/refs was gigabytes.
> 
> Wouldn't those users be calling update-server-info from something like a
> post-receive hook where they *know* the refs/packs just got updated?
>
> Well, there is "transfer.unpackLimit" to contend with, but that's just
> for "packs are same", not "refs are same", and that file's presumably
> much smaller.

Yeah, I think there's sort of an open question here of who is calling
update-server-info when nothing got updated. I think the only place we
call it automatically is via receive-pack, but I'd guess Eric runs it as
part of public-inbox scripts.

I agree that this is probably mostly about info/packs. Not every push
(or repo update) will create one, but every push _should_ be changing a
ref (if it succeeds at all).  And I'd guess Eric's interest comes from
the use of Git in public-inbox, which is going to make frequent but
small updates.

So this does seem like a really niche case; it avoids one single HTTP
request of a small that should generally be small (unless you're letting
your pack list grow really big, but I think there are other issues with
that) in a case that we know will generate a bunch of other HTTP traffic
(if somebody updated the server info, there was indeed a push, so you'd
get a refreshed info/refs and presumably the new loose objects).

That said, the logic to preserve the mtime is pretty straightforward, so
I don't mind it too much if Eric finds this really improves things for
him.

> > I don't think our usual dot-locking is great here. What does the
> > race-loser do when it can't take the lock? It can't just skip its
> > update, since it needs to make sure that its new pack is mentioned.
> 
> Right, I mean a *global* .git/I_AM_DOING_WRITES.lock, because there's no
> way to square the ref backend's notion of per-ref ref lock enabling
> concurrent pushes with update-server-info's desire to generate metadata
> showing *all* the refs.

OK. I agree that would work, but it's nasty to delay user-initiated
operations for ongoing maintenance (another obvious place to want such a
lock is for pruning, which can take quite a while).

> > So it has to wait and poll until the other process finishes. I guess
> > maybe that isn't the end of the world.
> 
> If "its" is update-server-info this won't work. It's possible for two
> update-server-info processes to be racing in such a way that their
> for_each_ref() reads a ref at a given value that'll be updated 1
> millisecond later, but to then have that update's update-server-info
> "win" the race to update the info files (hypothetical locks on those
> info files and all).
> 
> Thus the "info" files will be updated to old values, since we'll see we
> need changes, but change things to the old values.
> 
> All things that *can* be dealt with in some clever ways, but I think
> just further proof nobody's using this for anything serious :)
> 
> I.e. the "commit A happened before B" but also "commit B's post-* hooks
> finished after A" is a thing that happens quite frequently (per my
> logs).

I think it would work because any update-server-info, whether from A or
B, will take into account the full current repo state (and we don't look
at that state until we take the lock). So you might get an interleaved
"A-push, B-push, B-maint, A-maint", but that's OK. A-maint will
represent B's state when it runs.

> > I'm not entirely sure of all of the magic that "stale" check is trying
> > to accomplish. I think there's some bits in there that try to preserve
> > the existing ordering, but I don't know why anyone would care (and there
> > are so many cases where the ordering gets thrown out that I think
> > anybody who does care is likely to get disappointed).
> 
> My reading of it is that it's premature optimization that can go away
> (and most of it has already), for "it's cheap" and "if not it's called
> from the 'I know I had an update'" hook case, as noted above.<

That's my reading, too, but I didn't want to be responsible for
regressing some obscure case. At least Eric seems to _use_
update-server-info. ;)

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-14  9:47       ` Jeff King
@ 2019-05-14 10:33         ` Ævar Arnfjörð Bjarmason
  2019-05-14 11:24           ` Jeff King
  2019-05-14 11:50         ` Eric Wong
  1 sibling, 1 reply; 26+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-05-14 10:33 UTC (permalink / raw)
  To: Jeff King; +Cc: Eric Wong, git, Junio C Hamano


On Tue, May 14 2019, Jeff King wrote:

> On Sun, May 12, 2019 at 09:16:31AM +0200, Ævar Arnfjörð Bjarmason wrote:
>
>> > You're probably right (especially because we'd just spent O(n) work
>> > generating the candidate file). But note that I have seen pathological
>> > cases where info/refs was gigabytes.
>>
>> Wouldn't those users be calling update-server-info from something like a
>> post-receive hook where they *know* the refs/packs just got updated?
>>
>> Well, there is "transfer.unpackLimit" to contend with, but that's just
>> for "packs are same", not "refs are same", and that file's presumably
>> much smaller.
>
> Yeah, I think there's sort of an open question here of who is calling
> update-server-info when nothing got updated. I think the only place we
> call it automatically is via receive-pack, but I'd guess Eric runs it as
> part of public-inbox scripts.
>
> I agree that this is probably mostly about info/packs. Not every push
> (or repo update) will create one, but every push _should_ be changing a
> ref (if it succeeds at all).  And I'd guess Eric's interest comes from
> the use of Git in public-inbox, which is going to make frequent but
> small updates.
>
> So this does seem like a really niche case; it avoids one single HTTP
> request of a small that should generally be small (unless you're letting
> your pack list grow really big, but I think there are other issues with
> that) in a case that we know will generate a bunch of other HTTP traffic
> (if somebody updated the server info, there was indeed a push, so you'd
> get a refreshed info/refs and presumably the new loose objects).
>
> That said, the logic to preserve the mtime is pretty straightforward, so
> I don't mind it too much if Eric finds this really improves things for
> him.
>
>> > I don't think our usual dot-locking is great here. What does the
>> > race-loser do when it can't take the lock? It can't just skip its
>> > update, since it needs to make sure that its new pack is mentioned.
>>
>> Right, I mean a *global* .git/I_AM_DOING_WRITES.lock, because there's no
>> way to square the ref backend's notion of per-ref ref lock enabling
>> concurrent pushes with update-server-info's desire to generate metadata
>> showing *all* the refs.
>
> OK. I agree that would work, but it's nasty to delay user-initiated
> operations for ongoing maintenance (another obvious place to want such a
> lock is for pruning, which can take quite a while).
>
>> > So it has to wait and poll until the other process finishes. I guess
>> > maybe that isn't the end of the world.
>>
>> If "its" is update-server-info this won't work. It's possible for two
>> update-server-info processes to be racing in such a way that their
>> for_each_ref() reads a ref at a given value that'll be updated 1
>> millisecond later, but to then have that update's update-server-info
>> "win" the race to update the info files (hypothetical locks on those
>> info files and all).
>>
>> Thus the "info" files will be updated to old values, since we'll see we
>> need changes, but change things to the old values.
>>
>> All things that *can* be dealt with in some clever ways, but I think
>> just further proof nobody's using this for anything serious :)
>>
>> I.e. the "commit A happened before B" but also "commit B's post-* hooks
>> finished after A" is a thing that happens quite frequently (per my
>> logs).
>
> I think it would work because any update-server-info, whether from A or
> B, will take into account the full current repo state (and we don't look
> at that state until we take the lock). So you might get an interleaved
> "A-push, B-push, B-maint, A-maint", but that's OK. A-maint will
> represent B's state when it runs.

Maybe we're talking about different things. I mean the following
sequence:

 1. Refs "X" and "Y" are at X=A Y=A
 2. Concurrent push #1 happens, updating X from A..F
 3. Concurrent push #2 happens, updating Y from A..F
 4. Concurrent push #1 succeeds
 5. Concurrent push #1 starts update-server-info. Reads X=F Y=A
 5. Concurrent push #2 succeeds
 6. Concurrent push #2 starts update-server-info. Reads X=F Y=F
 7. Concurrent push #2's update-server-info finishes, X=F Y=F written to "info"
 8. Concurrent push #1's update-server-info finishes, X=A Y=F written to "info"

I.e. because we have per-ref locks and no lock at all on
update-server-info (but that would need to be a global ref lock, not
just on the "info" files) we can have a push that's already read "X"'s
value as "A" while updating "Y" win the race against an
update-server-info that updated "X"'s value to "F".

It will get fixed on the next push (at least as far as "X"'s value
goes), but until that time dumb clients will falsely see that "X" hasn't
been updated.

>> > I'm not entirely sure of all of the magic that "stale" check is trying
>> > to accomplish. I think there's some bits in there that try to preserve
>> > the existing ordering, but I don't know why anyone would care (and there
>> > are so many cases where the ordering gets thrown out that I think
>> > anybody who does care is likely to get disappointed).
>>
>> My reading of it is that it's premature optimization that can go away
>> (and most of it has already), for "it's cheap" and "if not it's called
>> from the 'I know I had an update'" hook case, as noted above.<
>
> That's my reading, too, but I didn't want to be responsible for
> regressing some obscure case. At least Eric seems to _use_
> update-server-info. ;)

Yeah I agree with that sentiment. I don't use it. I was just wondering
if for those who still want it this wasn't a relatively obscure
use-case, so it should perhaps be hidden behind an option if there were
optimization concerns.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-14 10:33         ` Ævar Arnfjörð Bjarmason
@ 2019-05-14 11:24           ` Jeff King
  2019-05-14 11:57             ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff King @ 2019-05-14 11:24 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano

On Tue, May 14, 2019 at 12:33:11PM +0200, Ævar Arnfjörð Bjarmason wrote:

> > I think it would work because any update-server-info, whether from A or
> > B, will take into account the full current repo state (and we don't look
> > at that state until we take the lock). So you might get an interleaved
> > "A-push, B-push, B-maint, A-maint", but that's OK. A-maint will
> > represent B's state when it runs.
> 
> Maybe we're talking about different things. I mean the following
> sequence:
> 
>  1. Refs "X" and "Y" are at X=A Y=A
>  2. Concurrent push #1 happens, updating X from A..F
>  3. Concurrent push #2 happens, updating Y from A..F
>  4. Concurrent push #1 succeeds
>  5. Concurrent push #1 starts update-server-info. Reads X=F Y=A
>  5. Concurrent push #2 succeeds
>  6. Concurrent push #2 starts update-server-info. Reads X=F Y=F
>  7. Concurrent push #2's update-server-info finishes, X=F Y=F written to "info"
>  8. Concurrent push #1's update-server-info finishes, X=A Y=F written to "info"
> 
> I.e. because we have per-ref locks and no lock at all on
> update-server-info (but that would need to be a global ref lock, not
> just on the "info" files) we can have a push that's already read "X"'s
> value as "A" while updating "Y" win the race against an
> update-server-info that updated "X"'s value to "F".
> 
> It will get fixed on the next push (at least as far as "X"'s value
> goes), but until that time dumb clients will falsely see that "X" hasn't
> been updated.

That's the same situation. But I thought we were talking about having an
update-server-info lock. In which case the #2 update-server-info or the
#1 update-server-info runs in its entirety, and cannot have their read
and write steps interleaved (that's what I meant by "don't look at the
state until we take the lock"). Then that gives us a strict ordering: we
know that _some_ update-server-info (be it #1 or #2's) will run after
any given update.

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-14  9:47       ` Jeff King
  2019-05-14 10:33         ` Ævar Arnfjörð Bjarmason
@ 2019-05-14 11:50         ` Eric Wong
  2019-05-14 12:13           ` dumb HTTP things I want to do Eric Wong
  2019-05-14 12:19           ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason
  1 sibling, 2 replies; 26+ messages in thread
From: Eric Wong @ 2019-05-14 11:50 UTC (permalink / raw)
  To: Jeff King; +Cc: Ævar Arnfjörð Bjarmason, git, Junio C Hamano

Jeff King <peff@peff.net> wrote:
> Yeah, I think there's sort of an open question here of who is calling
> update-server-info when nothing got updated. I think the only place we
> call it automatically is via receive-pack, but I'd guess Eric runs it as
> part of public-inbox scripts.

Correct.  post-update doesn't get run because public-inbox
relies on fast-import.  I have mirrors using "git fetch", which
also doesn't call post-update, either so I was calling
update-server-info in my mirror script.

Since more people have taken an interest in mirroring things,
I figured I'd make "public-inbox-index" (the script which
updates the Xapian and SQLite indices) call update-server-info
itself.

That way, it's simpler to mirror (v1) inboxes:

  git fetch && git update-server-info && public-inbox-index

becomes:

  git fetch && public-inbox-index

That's a huge savings in cognitive overhead.

So, my eventual goal for this is we get to the point where any
git operation which changes refs will automatically update
info/refs if it exists.

Ditto for objects/info/packs on any pack changes.

This, like my bitmap-by-default change is among the things
I'm trying to make it easier for more users to start
self-hosting (including mirroring) any type of git repo.

Anyways, I am far from knowledgeable about the locking
discussion for git, though :x

> That's my reading, too, but I didn't want to be responsible for
> regressing some obscure case. At least Eric seems to _use_
> update-server-info. ;)

I also have something else on my mind for abusing info files with :>
(another email)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-14 11:24           ` Jeff King
@ 2019-05-14 11:57             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 26+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-05-14 11:57 UTC (permalink / raw)
  To: Jeff King; +Cc: Eric Wong, git, Junio C Hamano


On Tue, May 14 2019, Jeff King wrote:

> On Tue, May 14, 2019 at 12:33:11PM +0200, Ævar Arnfjörð Bjarmason wrote:
>
>> > I think it would work because any update-server-info, whether from A or
>> > B, will take into account the full current repo state (and we don't look
>> > at that state until we take the lock). So you might get an interleaved
>> > "A-push, B-push, B-maint, A-maint", but that's OK. A-maint will
>> > represent B's state when it runs.
>>
>> Maybe we're talking about different things. I mean the following
>> sequence:
>>
>>  1. Refs "X" and "Y" are at X=A Y=A
>>  2. Concurrent push #1 happens, updating X from A..F
>>  3. Concurrent push #2 happens, updating Y from A..F
>>  4. Concurrent push #1 succeeds
>>  5. Concurrent push #1 starts update-server-info. Reads X=F Y=A
>>  5. Concurrent push #2 succeeds
>>  6. Concurrent push #2 starts update-server-info. Reads X=F Y=F
>>  7. Concurrent push #2's update-server-info finishes, X=F Y=F written to "info"
>>  8. Concurrent push #1's update-server-info finishes, X=A Y=F written to "info"
>>
>> I.e. because we have per-ref locks and no lock at all on
>> update-server-info (but that would need to be a global ref lock, not
>> just on the "info" files) we can have a push that's already read "X"'s
>> value as "A" while updating "Y" win the race against an
>> update-server-info that updated "X"'s value to "F".
>>
>> It will get fixed on the next push (at least as far as "X"'s value
>> goes), but until that time dumb clients will falsely see that "X" hasn't
>> been updated.
>
> That's the same situation. But I thought we were talking about having an
> update-server-info lock. In which case the #2 update-server-info or the
> #1 update-server-info runs in its entirety, and cannot have their read
> and write steps interleaved (that's what I meant by "don't look at the
> state until we take the lock"). Then that gives us a strict ordering: we
> know that _some_ update-server-info (be it #1 or #2's) will run after
> any given update.

Yeah you're right. I *thought* in my last E-mail we were talking about
the current state, but re-reading upthread I see that was a fail on my
part.

An update-server-info lock would solve this indeed. We could still end
up with a situation where whatever a naïve version of the lockfile API
would fail for the "new" update since the old one was underway, so we'd
need something similar to core.*Ref*Timeout, but if we ran into a *.lock
or the timeout we could exit non-zero, as opposed to silently failing
like it does now when it races.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* dumb HTTP things I want to do
  2019-05-14 11:50         ` Eric Wong
@ 2019-05-14 12:13           ` Eric Wong
  2019-05-14 12:27             ` Jeff King
  2019-05-14 12:19           ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 26+ messages in thread
From: Eric Wong @ 2019-05-14 12:13 UTC (permalink / raw)
  To: Jeff King; +Cc: Ævar Arnfjörð Bjarmason, git, Junio C Hamano

Eric Wong <e@80x24.org> wrote:
> Jeff King <peff@peff.net> wrote:
> > That's my reading, too, but I didn't want to be responsible for
> > regressing some obscure case. At least Eric seems to _use_
> > update-server-info. ;)
> 
> I also have something else on my mind for abusing info files with :>
> (another email)

I'm not sure when/if I'll have time for this; but this ought to
be possible:

	GIT_DIR=$HTTP_URL git <any read-only command>

And possible without existing admins to setup or change
anything on their server.

Right now, I could do it by setting up a WebDAV server
and using fusedav[1] on the client.

But, not everybody runs a WebDAV server which allows PROPFIND
for listing files...  However, info/refs and objects/info/packs
can give us all the info we need without needing PROPFIND.  All
we'd need is the common GET/HEAD HTTP methods for read-only
access.

git doesn't need mmap; and curl + Range requests ought to be
able to get us what we need to emulate pread.  It'd be great for
low-latency LANs, maybe not so great with high latency; but
probably better in many cases than cloning a giant repo to cat
one blob.

Also, cloning on a static bundle ought to be doable with:

	git clone $REMOTE_OR_LOCAL_PATH/foo.bundle

And yeah, it also sucks that bundles double storage overhead
for admins; it would be nice if I could use bundles as alternates
or packs...

Anyways, all of this is probably a lot of work and I don't hack
much, anymore.

[1] I have many patches for fusedav, and the debian maintainer
    seems dead, and upstream's moved on: https://bugs.debian.org/fusedav
    davfs2 can't do Range requests, so it won't work for big repos...

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-14 11:50         ` Eric Wong
  2019-05-14 12:13           ` dumb HTTP things I want to do Eric Wong
@ 2019-05-14 12:19           ` Ævar Arnfjörð Bjarmason
  2019-05-14 12:29             ` Jeff King
  2019-05-15  0:45             ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong
  1 sibling, 2 replies; 26+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-05-14 12:19 UTC (permalink / raw)
  To: Eric Wong; +Cc: Jeff King, git, Junio C Hamano


On Tue, May 14 2019, Eric Wong wrote:

> Jeff King <peff@peff.net> wrote:
>> Yeah, I think there's sort of an open question here of who is calling
>> update-server-info when nothing got updated. I think the only place we
>> call it automatically is via receive-pack, but I'd guess Eric runs it as
>> part of public-inbox scripts.
>
> Correct.  post-update doesn't get run because public-inbox
> relies on fast-import.  I have mirrors using "git fetch", which
> also doesn't call post-update, either so I was calling
> update-server-info in my mirror script.
>
> Since more people have taken an interest in mirroring things,
> I figured I'd make "public-inbox-index" (the script which
> updates the Xapian and SQLite indices) call update-server-info
> itself.
>
> That way, it's simpler to mirror (v1) inboxes:
>
>   git fetch && git update-server-info && public-inbox-index
>
> becomes:
>
>   git fetch && public-inbox-index
>
> That's a huge savings in cognitive overhead.
>
> So, my eventual goal for this is we get to the point where any
> git operation which changes refs will automatically update
> info/refs if it exists.
>
> Ditto for objects/info/packs on any pack changes.
>
> This, like my bitmap-by-default change is among the things
> I'm trying to make it easier for more users to start
> self-hosting (including mirroring) any type of git repo.
>
> Anyways, I am far from knowledgeable about the locking
> discussion for git, though :x
>
>> That's my reading, too, but I didn't want to be responsible for
>> regressing some obscure case. At least Eric seems to _use_
>> update-server-info. ;)
>
> I also have something else on my mind for abusing info files with :>
> (another email)

Aside from this change, I wonder if making "fetch" optionally "exit 1"
if no refs were updated would be useful, as in the below WIP. Of course
it would be better to distinguish errors from "no refs to update".

I've hacked around it not doing that in the past for similar "fetch and
index" things by looking at for-each-ref before/after, or only seeing if
HEAD changed (so this wouldn't be a general solution...):

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 4ba63d5ac6..da5414d9db 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -47,6 +47,7 @@ static int prune_tags = -1; /* unspecified */
 #define PRUNE_TAGS_BY_DEFAULT 0 /* do we prune tags by default? */

 static int all, append, dry_run, force, keep, multiple, update_head_ok, verbosity, deepen_relative;
+static int exit_code;
 static int progress = -1;
 static int tags = TAGS_DEFAULT, unshallow, update_shallow, deepen;
 static int max_children = 1;
@@ -66,6 +67,7 @@ static struct refspec refmap = REFSPEC_INIT_FETCH;
 static struct list_objects_filter_options filter_options;
 static struct string_list server_options = STRING_LIST_INIT_DUP;
 static struct string_list negotiation_tip = STRING_LIST_INIT_NODUP;
+static int updated_refs;

 static int git_fetch_config(const char *k, const char *v, void *cb)
 {
@@ -133,6 +135,8 @@ static struct option builtin_fetch_options[] = {
 	{ OPTION_CALLBACK, 0, "recurse-submodules", &recurse_submodules, N_("on-demand"),
 		    N_("control recursive fetching of submodules"),
 		    PARSE_OPT_OPTARG, option_fetch_parse_recurse_submodules },
+	OPT_BOOL(0, "exit-code", &exit_code,
+		 N_("exit successfully if refs are updated")),
 	OPT_BOOL(0, "dry-run", &dry_run,
 		 N_("dry run")),
 	OPT_BOOL('k', "keep", &keep, N_("keep downloaded pack")),
@@ -522,8 +526,10 @@ static int s_update_ref(const char *action,
 	struct strbuf err = STRBUF_INIT;
 	int ret, df_conflict = 0;

-	if (dry_run)
+	if (dry_run) {
+		updated_refs++;
 		return 0;
+	}
 	if (!rla)
 		rla = default_rla.buf;
 	msg = xstrfmt("%s: %s", rla, action);
@@ -545,6 +551,7 @@ static int s_update_ref(const char *action,
 	ref_transaction_free(transaction);
 	strbuf_release(&err);
 	free(msg);
+	updated_refs++;
 	return 0;
 fail:
 	ref_transaction_free(transaction);
@@ -1680,5 +1687,9 @@ int cmd_fetch(int argc, const char **argv, const char *prefix)
 	run_command_v_opt(argv_gc_auto.argv, RUN_GIT_CMD);
 	argv_array_clear(&argv_gc_auto);

-	return result;
+	if (result)
+		return result;
+	if (exit_code)
+		return !updated_refs;
+	return 0;
 }

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: dumb HTTP things I want to do
  2019-05-14 12:13           ` dumb HTTP things I want to do Eric Wong
@ 2019-05-14 12:27             ` Jeff King
  0 siblings, 0 replies; 26+ messages in thread
From: Jeff King @ 2019-05-14 12:27 UTC (permalink / raw)
  To: Eric Wong; +Cc: Ævar Arnfjörð Bjarmason, git, Junio C Hamano

On Tue, May 14, 2019 at 12:13:50PM +0000, Eric Wong wrote:

> I'm not sure when/if I'll have time for this; but this ought to
> be possible:
> 
> 	GIT_DIR=$HTTP_URL git <any read-only command>
> 
> And possible without existing admins to setup or change
> anything on their server.
> [...]
> git doesn't need mmap; and curl + Range requests ought to be
> able to get us what we need to emulate pread.  It'd be great for
> low-latency LANs, maybe not so great with high latency; but
> probably better in many cases than cloning a giant repo to cat
> one blob.

My first thought here is that we could probably just use the filesystem
boundary as the appropriate layer, and have an httpfs fuse driver. Your
mention of fusedav makes me think you've already gone this route.

I guess non-dav http doesn't necessarily let us enumerate directories.
But it might be enough if we could insert ourselves in a few key spots.
E.g., when we try to opendir("objects/packs") and it fails with ENOSYS
or similar, then we fallback to opening "objects/info/packs" to get the
same information (and ditto for loose ref traversal).

There'd still be _some_ work in Git, but it wouldn't require replacing
every single read with HTTP magic.

> Also, cloning on a static bundle ought to be doable with:
> 
> 	git clone $REMOTE_OR_LOCAL_PATH/foo.bundle

Some nearby discussion:

  https://public-inbox.org/git/20190514092900.GA11679@sigill.intra.peff.net/

> And yeah, it also sucks that bundles double storage overhead
> for admins; it would be nice if I could use bundles as alternates
> or packs...

Yes. It's nice that they're a single file for some uses, but in terms of
implementation it would be easier if the bundle files just said "here
are some refs, and you can find my packfile in the relative filename
XYZ.pack". And then you'd just store the two of them together (and a
matching .idx if you wanted to use it as a real pack, but the bundle
readers wouldn't care).

It probably wouldn't be that hard to wedge that into the bundle format
by bumping the version field.

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] update-server-info: avoid needless overwrites
  2019-05-14 12:19           ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason
@ 2019-05-14 12:29             ` Jeff King
  2019-05-15  0:45             ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong
  1 sibling, 0 replies; 26+ messages in thread
From: Jeff King @ 2019-05-14 12:29 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano

On Tue, May 14, 2019 at 02:19:36PM +0200, Ævar Arnfjörð Bjarmason wrote:

> Aside from this change, I wonder if making "fetch" optionally "exit 1"
> if no refs were updated would be useful, as in the below WIP. Of course
> it would be better to distinguish errors from "no refs to update".

That seems very sensible to me, as long as it's an optional flag as you
have it here.

> @@ -133,6 +135,8 @@ static struct option builtin_fetch_options[] = {
>  	{ OPTION_CALLBACK, 0, "recurse-submodules", &recurse_submodules, N_("on-demand"),
>  		    N_("control recursive fetching of submodules"),
>  		    PARSE_OPT_OPTARG, option_fetch_parse_recurse_submodules },
> +	OPT_BOOL(0, "exit-code", &exit_code,
> +		 N_("exit successfully if refs are updated")),

I think it makes sense to flip this explanation: "exit non-zero if no
refs are updated" or "treat it as an error if no refs are updated".
Since the default behavior is already to exit successfully in this case. :)

We may also want to advertise a particular result code so callers can
distinguish it from regular die() errors.

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 2/1] server-info: conditionally update on fetch
  2019-05-14 12:19           ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason
  2019-05-14 12:29             ` Jeff King
@ 2019-05-15  0:45             ` Eric Wong
  2019-05-15 20:38               ` [WIP] repack leaving stale entries in objects/info/packs Eric Wong
  1 sibling, 1 reply; 26+ messages in thread
From: Eric Wong @ 2019-05-15  0:45 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Jeff King, git, Junio C Hamano

Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
> Aside from this change, I wonder if making "fetch" optionally "exit 1"
> if no refs were updated would be useful, as in the below WIP. Of course
> it would be better to distinguish errors from "no refs to update".

Yes, we should've had this feature all along :)

And it's easy for me to build off your WIP to have fetch
update server info iff info/refs already exists:

-------8<-------
Subject: [PATCH 2/1] server-info: conditionally update on fetch

Since fetch can invalidate existing server info files, use the
new `updated_refs' counter to update server info files iff
info/refs already exists.

Note: this depends on Ævar's WIP in:
	https://public-inbox.org/git/87ftphw7mv.fsf@evledraar.gmail.com/

Signed-off-by: Eric Wong <e@80x24.org>
---
 builtin/fetch.c        |  3 +++
 server-info.c          | 22 +++++++++++++++++++---
 t/t5513-fetch-track.sh | 21 +++++++++++++++++++++
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index da5414d9db..b35d4d105d 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -1681,6 +1681,9 @@ int cmd_fetch(int argc, const char **argv, const char *prefix)
 
 	close_all_packs(the_repository->objects);
 
+	if (updated_refs)
+		update_server_info(-1);
+
 	argv_array_pushl(&argv_gc_auto, "gc", "--auto", NULL);
 	if (verbosity < 0)
 		argv_array_push(&argv_gc_auto, "--quiet");
diff --git a/server-info.c b/server-info.c
index e68f785c2f..d4065d56a3 100644
--- a/server-info.c
+++ b/server-info.c
@@ -7,6 +7,7 @@
 #include "packfile.h"
 #include "object-store.h"
 #include "strbuf.h"
+#include "dir.h"
 
 struct update_info_ctx {
 	FILE *cur_fp;
@@ -170,10 +171,25 @@ static int generate_info_refs(struct update_info_ctx *uic)
 	return for_each_ref(add_info_ref, uic);
 }
 
-static int update_info_refs(int force)
+static int want_update(int *force, const char *path)
+{
+	if (*force < 0) {
+		if (file_exists(path))
+			*force = 0; /* continue to normal update */
+		else
+			return 0;
+	}
+	return 1;
+}
+
+static int update_info_refs(int *force)
 {
 	char *path = git_pathdup("info/refs");
-	int ret = update_info_file(path, generate_info_refs, force);
+	int ret = 0;
+
+	if (want_update(force, path))
+		ret = update_info_file(path, generate_info_refs, *force);
+
 	free(path);
 	return ret;
 }
@@ -361,7 +377,7 @@ int update_server_info(int force)
 	 */
 	int errs = 0;
 
-	errs = errs | update_info_refs(force);
+	errs = errs | update_info_refs(&force);
 	errs = errs | update_info_packs(force);
 
 	/* remove leftover rev-cache file if there is any */
diff --git a/t/t5513-fetch-track.sh b/t/t5513-fetch-track.sh
index 65d1e05bd6..421f16ddfd 100755
--- a/t/t5513-fetch-track.sh
+++ b/t/t5513-fetch-track.sh
@@ -27,4 +27,25 @@ test_expect_success fetch '
 	)
 '
 
+test_expect_success 'info/refs not created by fetch' '
+	(
+		cd other &&
+		test_path_is_dir .git/info &&
+		! test_path_is_file .git/info/refs
+	)
+'
+
+test_expect_success 'info/refs updated by fetch if it already exists' '
+	git branch b/for-info-refs &&
+	(
+		cd other &&
+		git update-server-info &&
+		test_path_is_file .git/info/refs &&
+		! grep b/for-info-refs .git/info/refs &&
+		git fetch &&
+		test_path_is_file .git/info/refs &&
+		grep b/for-info-refs .git/info/refs
+	)
+'
+
 test_done
-- 
EW

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [WIP] repack leaving stale entries in objects/info/packs
  2019-05-15  0:45             ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong
@ 2019-05-15 20:38               ` Eric Wong
  2019-05-15 21:48                 ` Jeff King
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Wong @ 2019-05-15 20:38 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Junio C Hamano, Ævar Arnfjörð Bjarmason

I've also noticed objects/info/packs contains stale entries
after repack/gc runs on current git.

Tried adding reprepare_packed_git before update_server_info,
but that didn't seem to work; so maybe something isn't cleared.
Might have time to investigate more this week, might not...

diff --git a/builtin/repack.c b/builtin/repack.c
index 67f8978043..ec46f2099a 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -559,8 +559,10 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 			prune_shallow(PRUNE_QUICK);
 	}
 
-	if (!no_update_server_info)
+	if (!no_update_server_info) {
+		reprepare_packed_git(the_repository);
 		update_server_info(0);
+	}
 	remove_temporary_files();
 
 	if (git_env_bool(GIT_TEST_MULTI_PACK_INDEX, 0))
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 7411bf7fec..867d4fb0c7 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -71,6 +71,7 @@ test_expect_success 'gc --keep-largest-pack' '
 		git gc --keep-largest-pack &&
 		( cd .git/objects/pack && ls *.pack ) >pack-list &&
 		test_line_count = 2 pack-list &&
+		test_line_count = 2 .git/objects/info/packs &&
 		test_path_is_file $BASE_PACK &&
 		git fsck
 	)


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [WIP] repack leaving stale entries in objects/info/packs
  2019-05-15 20:38               ` [WIP] repack leaving stale entries in objects/info/packs Eric Wong
@ 2019-05-15 21:48                 ` Jeff King
  2019-05-23  8:59                   ` [PATCH] server-info: do not list unlinked packs Eric Wong
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff King @ 2019-05-15 21:48 UTC (permalink / raw)
  To: Eric Wong; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason

On Wed, May 15, 2019 at 08:38:39PM +0000, Eric Wong wrote:

> I've also noticed objects/info/packs contains stale entries
> after repack/gc runs on current git.
> 
> Tried adding reprepare_packed_git before update_server_info,
> but that didn't seem to work; so maybe something isn't cleared.
> Might have time to investigate more this week, might not...

We never delete entries from the in-memory packed_git list; a reprepare
only adds to the list. You'd need to teach update_server_info() to
ignore packs which are no longer present (or switch to exec-ing a
separate update-server-info binary).

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] server-info: do not list unlinked packs
  2019-05-15 21:48                 ` Jeff King
@ 2019-05-23  8:59                   ` Eric Wong
  2019-05-23 10:24                     ` Jeff King
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Wong @ 2019-05-23  8:59 UTC (permalink / raw)
  To: Jeff King; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason

Jeff King <peff@peff.net> wrote:
> On Wed, May 15, 2019 at 08:38:39PM +0000, Eric Wong wrote:
> 
> > I've also noticed objects/info/packs contains stale entries
> > after repack/gc runs on current git.
> > 
> > Tried adding reprepare_packed_git before update_server_info,
> > but that didn't seem to work; so maybe something isn't cleared.
> > Might have time to investigate more this week, might not...
> 
> We never delete entries from the in-memory packed_git list; a reprepare
> only adds to the list. You'd need to teach update_server_info() to
> ignore packs which are no longer present (or switch to exec-ing a
> separate update-server-info binary).

Ah, checking files_exists() and setting a bit seems sufficient.

--------8<---------
Subject: [PATCH] server-info: do not list unlinked packs

Having non-existent packs in objects/info/packs causes
dumb HTTP clients to abort.

There remains a small window where the old objects/info/packs
file can refer to unlinked packs.  That's unavoidable even on a
local FS given the time-of-use-time-of-check window between
listing and retrieving files.

Signed-off-by: Eric Wong <e@80x24.org>
---
  I think the small window I refer to can be worked around by
  teaching the dumb HTTP client to reread objects/info/packs
  if it 404s while trying to GET a pack...

 object-store.h | 1 +
 server-info.c  | 7 ++++++-
 t/t6500-gc.sh  | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/object-store.h b/object-store.h
index 272e01e452..2c9facc8f2 100644
--- a/object-store.h
+++ b/object-store.h
@@ -77,6 +77,7 @@ struct packed_git {
 		 freshened:1,
 		 do_not_close:1,
 		 pack_promisor:1,
+		 pack_unlinked:1,
 		 multi_pack_index:1;
 	unsigned char hash[GIT_MAX_RAWSZ];
 	struct revindex_entry *revindex;
diff --git a/server-info.c b/server-info.c
index 41274d098b..69e2c5279b 100644
--- a/server-info.c
+++ b/server-info.c
@@ -1,4 +1,5 @@
 #include "cache.h"
+#include "dir.h"
 #include "repository.h"
 #include "refs.h"
 #include "object.h"
@@ -199,12 +200,16 @@ static void init_pack_info(const char *infofile, int force)
 		 */
 		if (!p->pack_local)
 			continue;
+		if (!file_exists(p->pack_name)) {
+			p->pack_unlinked = 1;
+			continue;
+		}
 		i++;
 	}
 	num_pack = i;
 	info = xcalloc(num_pack, sizeof(struct pack_info *));
 	for (i = 0, p = get_all_packs(the_repository); p; p = p->next) {
-		if (!p->pack_local)
+		if (!p->pack_local || p->pack_unlinked)
 			continue;
 		assert(i < num_pack);
 		info[i] = xcalloc(1, sizeof(struct pack_info));
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 515c6735e9..c0f04dc6b0 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -71,6 +71,8 @@ test_expect_success 'gc --keep-largest-pack' '
 		git gc --keep-largest-pack &&
 		( cd .git/objects/pack && ls *.pack ) >pack-list &&
 		test_line_count = 2 pack-list &&
+		awk "/^P /{print \$2}" <.git/objects/info/packs >pack-info &&
+		test_line_count = 2 pack-info &&
 		test_path_is_file $BASE_PACK &&
 		git fsck
 	)

base-commit: aa25c82427ae70aebf3b8f970f2afd54e9a2a8c6
-- 
EW

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] server-info: do not list unlinked packs
  2019-05-23  8:59                   ` [PATCH] server-info: do not list unlinked packs Eric Wong
@ 2019-05-23 10:24                     ` Jeff King
  2019-05-23 17:27                       ` [PATCH v2] " Eric Wong
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff King @ 2019-05-23 10:24 UTC (permalink / raw)
  To: Eric Wong; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason

On Thu, May 23, 2019 at 08:59:59AM +0000, Eric Wong wrote:

> > We never delete entries from the in-memory packed_git list; a reprepare
> > only adds to the list. You'd need to teach update_server_info() to
> > ignore packs which are no longer present (or switch to exec-ing a
> > separate update-server-info binary).
> 
> Ah, checking files_exists() and setting a bit seems sufficient.

Yes, though we do we even need to store the bit?

I.e.,

> @@ -199,12 +200,16 @@ static void init_pack_info(const char *infofile, int force)
>  		 */
>  		if (!p->pack_local)
>  			continue;
> +		if (!file_exists(p->pack_name)) {
> +			p->pack_unlinked = 1;
> +			continue;
> +		}
>  		i++;
>  	}
>  	num_pack = i;
>  	info = xcalloc(num_pack, sizeof(struct pack_info *));
>  	for (i = 0, p = get_all_packs(the_repository); p; p = p->next) {
> -		if (!p->pack_local)
> +		if (!p->pack_local || p->pack_unlinked)
>  			continue;
>  		assert(i < num_pack);
>  		info[i] = xcalloc(1, sizeof(struct pack_info));

If we just check file_exists() in the second loop, then this is entirely
local to update_server_info(). And other users of packed_git do not have
to wonder who is responsible for setting that flag in the global list.

It does mean you'd over-allocate the array (and num_pack would have to
be adjusted down to "i" after the second loop), but that's not a big
deal.  I do think the whole two-loop thing would be more readable if we
simply grew it on the fly with ALLOC_GROW().

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2] server-info: do not list unlinked packs
  2019-05-23 10:24                     ` Jeff King
@ 2019-05-23 17:27                       ` Eric Wong
  2019-05-24  6:05                         ` Jeff King
  2019-05-24  7:34                         ` Ævar Arnfjörð Bjarmason
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Wong @ 2019-05-23 17:27 UTC (permalink / raw)
  To: Jeff King; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason

Jeff King <peff@peff.net> wrote:
> On Thu, May 23, 2019 at 08:59:59AM +0000, Eric Wong wrote:
> 
> > > We never delete entries from the in-memory packed_git list; a reprepare
> > > only adds to the list. You'd need to teach update_server_info() to
> > > ignore packs which are no longer present (or switch to exec-ing a
> > > separate update-server-info binary).
> > 
> > Ah, checking files_exists() and setting a bit seems sufficient.
> 
> Yes, though we do we even need to store the bit?

I wanted to avoid the over-allocation, and I hit a bounds error
because I forgot to adjust num_pack as you mentioned below.

> I.e.,
> 
> > @@ -199,12 +200,16 @@ static void init_pack_info(const char *infofile, int force)
> >  		 */
> >  		if (!p->pack_local)
> >  			continue;
> > +		if (!file_exists(p->pack_name)) {
> > +			p->pack_unlinked = 1;
> > +			continue;
> > +		}
> >  		i++;
> >  	}
> >  	num_pack = i;
> >  	info = xcalloc(num_pack, sizeof(struct pack_info *));
> >  	for (i = 0, p = get_all_packs(the_repository); p; p = p->next) {
> > -		if (!p->pack_local)
> > +		if (!p->pack_local || p->pack_unlinked)
> >  			continue;
> >  		assert(i < num_pack);
> >  		info[i] = xcalloc(1, sizeof(struct pack_info));
> 
> If we just check file_exists() in the second loop, then this is entirely
> local to update_server_info(). And other users of packed_git do not have
> to wonder who is responsible for setting that flag in the global list.
> 
> It does mean you'd over-allocate the array (and num_pack would have to
> be adjusted down to "i" after the second loop), but that's not a big
> deal.  I do think the whole two-loop thing would be more readable if we
> simply grew it on the fly with ALLOC_GROW().

ALLOC_GROW makes the whole thing much nicer.
Thanks for the hint :>

---------------------8<---------------------
Subject: [PATCH] server-info: do not list unlinked packs

Having non-existent packs in objects/info/packs causes
dumb HTTP clients to abort.

v2: use single loop with ALLOC_GROW as suggested by Jeff King

Signed-off-by: Eric Wong <e@80x24.org>
Helped-by: Jeff King <peff@peff.net>
---
Interdiff:
  diff --git a/object-store.h b/object-store.h
  index 2c9facc8f2..272e01e452 100644
  --- a/object-store.h
  +++ b/object-store.h
  @@ -77,7 +77,6 @@ struct packed_git {
   		 freshened:1,
   		 do_not_close:1,
   		 pack_promisor:1,
  -		 pack_unlinked:1,
   		 multi_pack_index:1;
   	unsigned char hash[GIT_MAX_RAWSZ];
   	struct revindex_entry *revindex;
  diff --git a/server-info.c b/server-info.c
  index 69e2c5279b..92187c70db 100644
  --- a/server-info.c
  +++ b/server-info.c
  @@ -192,30 +192,21 @@ static void init_pack_info(const char *infofile, int force)
   {
   	struct packed_git *p;
   	int stale;
  -	int i = 0;
  +	int i;
  +	size_t alloc = 0;
   
   	for (p = get_all_packs(the_repository); p; p = p->next) {
   		/* we ignore things on alternate path since they are
   		 * not available to the pullers in general.
   		 */
  -		if (!p->pack_local)
  -			continue;
  -		if (!file_exists(p->pack_name)) {
  -			p->pack_unlinked = 1;
  -			continue;
  -		}
  -		i++;
  -	}
  -	num_pack = i;
  -	info = xcalloc(num_pack, sizeof(struct pack_info *));
  -	for (i = 0, p = get_all_packs(the_repository); p; p = p->next) {
  -		if (!p->pack_local || p->pack_unlinked)
  +		if (!p->pack_local || !file_exists(p->pack_name))
   			continue;
  -		assert(i < num_pack);
  +
  +		i = num_pack++;
  +		ALLOC_GROW(info, num_pack, alloc);
   		info[i] = xcalloc(1, sizeof(struct pack_info));
   		info[i]->p = p;
   		info[i]->old_num = -1;
  -		i++;
   	}
   
   	if (infofile && !force)

 server-info.c | 18 +++++++-----------
 t/t6500-gc.sh |  2 ++
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/server-info.c b/server-info.c
index 41274d098b..92187c70db 100644
--- a/server-info.c
+++ b/server-info.c
@@ -1,4 +1,5 @@
 #include "cache.h"
+#include "dir.h"
 #include "repository.h"
 #include "refs.h"
 #include "object.h"
@@ -191,26 +192,21 @@ static void init_pack_info(const char *infofile, int force)
 {
 	struct packed_git *p;
 	int stale;
-	int i = 0;
+	int i;
+	size_t alloc = 0;
 
 	for (p = get_all_packs(the_repository); p; p = p->next) {
 		/* we ignore things on alternate path since they are
 		 * not available to the pullers in general.
 		 */
-		if (!p->pack_local)
-			continue;
-		i++;
-	}
-	num_pack = i;
-	info = xcalloc(num_pack, sizeof(struct pack_info *));
-	for (i = 0, p = get_all_packs(the_repository); p; p = p->next) {
-		if (!p->pack_local)
+		if (!p->pack_local || !file_exists(p->pack_name))
 			continue;
-		assert(i < num_pack);
+
+		i = num_pack++;
+		ALLOC_GROW(info, num_pack, alloc);
 		info[i] = xcalloc(1, sizeof(struct pack_info));
 		info[i]->p = p;
 		info[i]->old_num = -1;
-		i++;
 	}
 
 	if (infofile && !force)
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 515c6735e9..c0f04dc6b0 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -71,6 +71,8 @@ test_expect_success 'gc --keep-largest-pack' '
 		git gc --keep-largest-pack &&
 		( cd .git/objects/pack && ls *.pack ) >pack-list &&
 		test_line_count = 2 pack-list &&
+		awk "/^P /{print \$2}" <.git/objects/info/packs >pack-info &&
+		test_line_count = 2 pack-info &&
 		test_path_is_file $BASE_PACK &&
 		git fsck
 	)
-- 
EW

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] server-info: do not list unlinked packs
  2019-05-23 17:27                       ` [PATCH v2] " Eric Wong
@ 2019-05-24  6:05                         ` Jeff King
  2019-05-24  7:34                         ` Ævar Arnfjörð Bjarmason
  1 sibling, 0 replies; 26+ messages in thread
From: Jeff King @ 2019-05-24  6:05 UTC (permalink / raw)
  To: Eric Wong; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason

On Thu, May 23, 2019 at 05:27:23PM +0000, Eric Wong wrote:

> ALLOC_GROW makes the whole thing much nicer.
> Thanks for the hint :>
> 
> ---------------------8<---------------------
> Subject: [PATCH] server-info: do not list unlinked packs

This looks fine to me (though the "v2" note is in your commit message).
I'd probably break it into two separate patches, but I can live with it
either way.

-Peff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] server-info: do not list unlinked packs
  2019-05-23 17:27                       ` [PATCH v2] " Eric Wong
  2019-05-24  6:05                         ` Jeff King
@ 2019-05-24  7:34                         ` Ævar Arnfjörð Bjarmason
  1 sibling, 0 replies; 26+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-05-24  7:34 UTC (permalink / raw)
  To: Eric Wong; +Cc: Jeff King, git, Junio C Hamano


On Thu, May 23 2019, Eric Wong wrote:

> Jeff King <peff@peff.net> wrote:
>> On Thu, May 23, 2019 at 08:59:59AM +0000, Eric Wong wrote:
>>
>> > > We never delete entries from the in-memory packed_git list; a reprepare
>> > > only adds to the list. You'd need to teach update_server_info() to
>> > > ignore packs which are no longer present (or switch to exec-ing a
>> > > separate update-server-info binary).
>> >
>> > Ah, checking files_exists() and setting a bit seems sufficient.
>>
>> Yes, though we do we even need to store the bit?
>
> I wanted to avoid the over-allocation, and I hit a bounds error
> because I forgot to adjust num_pack as you mentioned
> below. [...]ALLOC_GROW makes the whole thing much nicer.

If you want to avoid over-allocation the last thing you want is
ALLOC_GROW() :)

I.e. see alloc_nr() in cache.h, we explicitly over-allocate with it.

But as our extensive use of it shows this sort of pattern is the right
trade-off, both in terms of performance on modern hardware, and code
readability in cases like this where we're never going to realistically
have to worry about memory pressure.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2019-05-24  8:18 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-11  1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong
2019-05-11  7:35 ` Eric Sunshine
2019-05-11 20:47   ` [PATCH v2] " Eric Wong
2019-05-11 21:17 ` [PATCH] " Eric Wong
2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason
2019-05-12  0:38   ` Eric Wong
2019-05-12  4:08   ` Jeff King
2019-05-12  7:16     ` Ævar Arnfjörð Bjarmason
2019-05-14  9:47       ` Jeff King
2019-05-14 10:33         ` Ævar Arnfjörð Bjarmason
2019-05-14 11:24           ` Jeff King
2019-05-14 11:57             ` Ævar Arnfjörð Bjarmason
2019-05-14 11:50         ` Eric Wong
2019-05-14 12:13           ` dumb HTTP things I want to do Eric Wong
2019-05-14 12:27             ` Jeff King
2019-05-14 12:19           ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason
2019-05-14 12:29             ` Jeff King
2019-05-15  0:45             ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong
2019-05-15 20:38               ` [WIP] repack leaving stale entries in objects/info/packs Eric Wong
2019-05-15 21:48                 ` Jeff King
2019-05-23  8:59                   ` [PATCH] server-info: do not list unlinked packs Eric Wong
2019-05-23 10:24                     ` Jeff King
2019-05-23 17:27                       ` [PATCH v2] " Eric Wong
2019-05-24  6:05                         ` Jeff King
2019-05-24  7:34                         ` Ævar Arnfjörð Bjarmason
2019-05-13 23:17 ` [PATCH v3] update-server-info: avoid needless overwrites Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).