* [PATCH] update-server-info: avoid needless overwrites @ 2019-05-11 1:34 Eric Wong 2019-05-11 7:35 ` Eric Sunshine ` (3 more replies) 0 siblings, 4 replies; 26+ messages in thread From: Eric Wong @ 2019-05-11 1:34 UTC (permalink / raw) To: git Do not change the existing info/refs and objects/info/packs files if they match the existing content on the filesystem. This is intended to preserve mtime and make it easier for dumb HTTP pollers to rely on the If-Modified-Since header. Combined with stdio and kernel buffering; the kernel should be able to avoid block layer writes and reduce wear. Signed-off-by: Eric Wong <e@80x24.org> --- server-info.c | 61 +++++++++++++++++++++++++++++------ t/t5200-update-server-info.sh | 41 +++++++++++++++++++++++ 2 files changed, 92 insertions(+), 10 deletions(-) create mode 100755 t/t5200-update-server-info.sh diff --git a/server-info.c b/server-info.c index 41274d098b..34599e4817 100644 --- a/server-info.c +++ b/server-info.c @@ -6,37 +6,78 @@ #include "tag.h" #include "packfile.h" #include "object-store.h" +#include "strbuf.h" + +static int files_differ(FILE *fp, const char *path) +{ + struct stat st; + git_hash_ctx c; + struct object_id oid_old, oid_new; + struct strbuf tmp = STRBUF_INIT; + long new_len = ftell(fp); + + if (new_len < 0 || stat(path, &st) < 0) + return 1; + if (!S_ISREG(st.st_mode)) + return 1; + if ((off_t)new_len != st.st_size) + return 1; + + rewind(fp); + if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) + return 1; + the_hash_algo->init_fn(&c); + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); + the_hash_algo->final_fn(oid_new.hash, &c); + strbuf_release(&tmp); + + if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) + return 1; + the_hash_algo->init_fn(&c); + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); + the_hash_algo->final_fn(oid_old.hash, &c); + strbuf_release(&tmp); + + return hashcmp(oid_old.hash, oid_new.hash); +} /* * Create the file "path" by writing to a temporary file and renaming * it into place. The contents of the file come from "generate", which * should return non-zero if it encounters an error. */ -static int update_info_file(char *path, int (*generate)(FILE *)) +static int update_info_file(char *path, int (*generate)(FILE *), int force) { char *tmp = mkpathdup("%s_XXXXXX", path); int ret = -1; int fd = -1; FILE *fp = NULL, *to_close; + int do_update; safe_create_leading_directories(path); fd = git_mkstemp_mode(tmp, 0666); if (fd < 0) goto out; - to_close = fp = fdopen(fd, "w"); + to_close = fp = fdopen(fd, "w+"); if (!fp) goto out; fd = -1; ret = generate(fp); if (ret) goto out; + + do_update = force || files_differ(fp, path); fp = NULL; if (fclose(to_close)) goto out; - if (adjust_shared_perm(tmp) < 0) - goto out; - if (rename(tmp, path) < 0) - goto out; + if (do_update) { + if (adjust_shared_perm(tmp) < 0) + goto out; + if (rename(tmp, path) < 0) + goto out; + } else { + unlink(tmp); + } ret = 0; out: @@ -78,10 +119,10 @@ static int generate_info_refs(FILE *fp) return for_each_ref(add_info_ref, fp); } -static int update_info_refs(void) +static int update_info_refs(int force) { char *path = git_pathdup("info/refs"); - int ret = update_info_file(path, generate_info_refs); + int ret = update_info_file(path, generate_info_refs, force); free(path); return ret; } @@ -254,7 +295,7 @@ static int update_info_packs(int force) int ret; init_pack_info(infofile, force); - ret = update_info_file(infofile, write_pack_info_file); + ret = update_info_file(infofile, write_pack_info_file, force); free_pack_info(); free(infofile); return ret; @@ -269,7 +310,7 @@ int update_server_info(int force) */ int errs = 0; - errs = errs | update_info_refs(); + errs = errs | update_info_refs(force); errs = errs | update_info_packs(force); /* remove leftover rev-cache file if there is any */ diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh new file mode 100755 index 0000000000..f1cec46914 --- /dev/null +++ b/t/t5200-update-server-info.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +test_description='Test git stash show configuration.' + +. ./test-lib.sh + +test_expect_success 'setup' 'test_commit file' + +test_expect_success 'create info/refs' ' + git update-server-info && + test_path_is_file .git/info/refs +' + +test_expect_success 'modify and store mtime' ' + test-tool chmtime =0 .git/info/refs && + test-tool chmtime --get .git/info/refs >a +' + +test_expect_success 'info/refs is not needlessly overwritten' ' + git update-server-info && + test-tool chmtime --get .git/info/refs >b && + test_cmp a b +' + +test_expect_success 'info/refs can be forced to update' ' + git update-server-info -f && + test-tool chmtime --get .git/info/refs >b && + ! test_cmp a b +' + +test_expect_success 'info/refs updates when changes are made' ' + test-tool chmtime =0 .git/info/refs && + test-tool chmtime --get .git/info/refs >b && + test_cmp a b && + git update-ref refs/heads/foo HEAD && + git update-server-info && + test-tool chmtime --get .git/info/refs >b && + ! test_cmp a b +' + +test_done -- EW ^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-11 1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong @ 2019-05-11 7:35 ` Eric Sunshine 2019-05-11 20:47 ` [PATCH v2] " Eric Wong 2019-05-11 21:17 ` [PATCH] " Eric Wong ` (2 subsequent siblings) 3 siblings, 1 reply; 26+ messages in thread From: Eric Sunshine @ 2019-05-11 7:35 UTC (permalink / raw) To: Eric Wong; +Cc: Git List On Fri, May 10, 2019 at 9:35 PM Eric Wong <e@80x24.org> wrote: > Do not change the existing info/refs and objects/info/packs > files if they match the existing content on the filesystem. > This is intended to preserve mtime and make it easier for dumb > HTTP pollers to rely on the If-Modified-Since header. > [...] > Signed-off-by: Eric Wong <e@80x24.org> > --- > diff --git a/server-info.c b/server-info.c > @@ -6,37 +6,78 @@ > +static int files_differ(FILE *fp, const char *path) > +{ > + [...] > + struct strbuf tmp = STRBUF_INIT; > + [...] > + if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) > + return 1; Although strbuf_fread() will release 'tmp' automatically if the read actually fails, it won't do so otherwise. So, if it reads more or fewer bytes than expected, for some reason, then this early return will leak the strbuf, won't it? > + the_hash_algo->init_fn(&c); > + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); > + the_hash_algo->final_fn(oid_new.hash, &c); > + strbuf_release(&tmp); > + > + if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) > + return 1; Same issue. > diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh > @@ -0,0 +1,41 @@ > +test_description='Test git stash show configuration.' Copy/paste error for test description? ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v2] update-server-info: avoid needless overwrites 2019-05-11 7:35 ` Eric Sunshine @ 2019-05-11 20:47 ` Eric Wong 0 siblings, 0 replies; 26+ messages in thread From: Eric Wong @ 2019-05-11 20:47 UTC (permalink / raw) To: Eric Sunshine; +Cc: git Eric Sunshine <sunshine@sunshineco.com> wrote: > On Fri, May 10, 2019 at 9:35 PM Eric Wong <e@80x24.org> wrote: > > + if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) > > + return 1; > > Although strbuf_fread() will release 'tmp' automatically if the read > actually fails, it won't do so otherwise. So, if it reads more or > fewer bytes than expected, for some reason, then this early return > will leak the strbuf, won't it? > > + if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) > > + return 1; > > Same issue. > > > diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh > > @@ -0,0 +1,41 @@ > > +test_description='Test git stash show configuration.' > > Copy/paste error for test description? Yup, thanks for the review. Updated patch below -----8<------ Subject: [PATCH] update-server-info: avoid needless overwrites Do not change the existing info/refs and objects/info/packs files if they match the existing content on the filesystem. This is intended to preserve mtime and make it easier for dumb HTTP pollers to rely on the If-Modified-Since header. Combined with stdio and kernel buffering; the kernel should be able to avoid block layer writes and reduce wear. Signed-off-by: Eric Wong <e@80x24.org> --- Interdiff: diff --git a/server-info.c b/server-info.c index 34599e4817..11515804d4 100644 --- a/server-info.c +++ b/server-info.c @@ -24,15 +24,19 @@ static int files_differ(FILE *fp, const char *path) return 1; rewind(fp); - if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) + if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) { + strbuf_release(&tmp); return 1; + } the_hash_algo->init_fn(&c); the_hash_algo->update_fn(&c, tmp.buf, tmp.len); the_hash_algo->final_fn(oid_new.hash, &c); strbuf_release(&tmp); - if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) + if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) { + strbuf_release(&tmp); return 1; + } the_hash_algo->init_fn(&c); the_hash_algo->update_fn(&c, tmp.buf, tmp.len); the_hash_algo->final_fn(oid_old.hash, &c); diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh index f1cec46914..21a58eecb9 100755 --- a/t/t5200-update-server-info.sh +++ b/t/t5200-update-server-info.sh @@ -1,6 +1,6 @@ #!/bin/sh -test_description='Test git stash show configuration.' +test_description='Test git update-server-info' . ./test-lib.sh server-info.c | 65 +++++++++++++++++++++++++++++------ t/t5200-update-server-info.sh | 41 ++++++++++++++++++++++ 2 files changed, 96 insertions(+), 10 deletions(-) create mode 100755 t/t5200-update-server-info.sh diff --git a/server-info.c b/server-info.c index 41274d098b..11515804d4 100644 --- a/server-info.c +++ b/server-info.c @@ -6,37 +6,82 @@ #include "tag.h" #include "packfile.h" #include "object-store.h" +#include "strbuf.h" + +static int files_differ(FILE *fp, const char *path) +{ + struct stat st; + git_hash_ctx c; + struct object_id oid_old, oid_new; + struct strbuf tmp = STRBUF_INIT; + long new_len = ftell(fp); + + if (new_len < 0 || stat(path, &st) < 0) + return 1; + if (!S_ISREG(st.st_mode)) + return 1; + if ((off_t)new_len != st.st_size) + return 1; + + rewind(fp); + if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) { + strbuf_release(&tmp); + return 1; + } + the_hash_algo->init_fn(&c); + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); + the_hash_algo->final_fn(oid_new.hash, &c); + strbuf_release(&tmp); + + if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) { + strbuf_release(&tmp); + return 1; + } + the_hash_algo->init_fn(&c); + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); + the_hash_algo->final_fn(oid_old.hash, &c); + strbuf_release(&tmp); + + return hashcmp(oid_old.hash, oid_new.hash); +} /* * Create the file "path" by writing to a temporary file and renaming * it into place. The contents of the file come from "generate", which * should return non-zero if it encounters an error. */ -static int update_info_file(char *path, int (*generate)(FILE *)) +static int update_info_file(char *path, int (*generate)(FILE *), int force) { char *tmp = mkpathdup("%s_XXXXXX", path); int ret = -1; int fd = -1; FILE *fp = NULL, *to_close; + int do_update; safe_create_leading_directories(path); fd = git_mkstemp_mode(tmp, 0666); if (fd < 0) goto out; - to_close = fp = fdopen(fd, "w"); + to_close = fp = fdopen(fd, "w+"); if (!fp) goto out; fd = -1; ret = generate(fp); if (ret) goto out; + + do_update = force || files_differ(fp, path); fp = NULL; if (fclose(to_close)) goto out; - if (adjust_shared_perm(tmp) < 0) - goto out; - if (rename(tmp, path) < 0) - goto out; + if (do_update) { + if (adjust_shared_perm(tmp) < 0) + goto out; + if (rename(tmp, path) < 0) + goto out; + } else { + unlink(tmp); + } ret = 0; out: @@ -78,10 +123,10 @@ static int generate_info_refs(FILE *fp) return for_each_ref(add_info_ref, fp); } -static int update_info_refs(void) +static int update_info_refs(int force) { char *path = git_pathdup("info/refs"); - int ret = update_info_file(path, generate_info_refs); + int ret = update_info_file(path, generate_info_refs, force); free(path); return ret; } @@ -254,7 +299,7 @@ static int update_info_packs(int force) int ret; init_pack_info(infofile, force); - ret = update_info_file(infofile, write_pack_info_file); + ret = update_info_file(infofile, write_pack_info_file, force); free_pack_info(); free(infofile); return ret; @@ -269,7 +314,7 @@ int update_server_info(int force) */ int errs = 0; - errs = errs | update_info_refs(); + errs = errs | update_info_refs(force); errs = errs | update_info_packs(force); /* remove leftover rev-cache file if there is any */ diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh new file mode 100755 index 0000000000..21a58eecb9 --- /dev/null +++ b/t/t5200-update-server-info.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +test_description='Test git update-server-info' + +. ./test-lib.sh + +test_expect_success 'setup' 'test_commit file' + +test_expect_success 'create info/refs' ' + git update-server-info && + test_path_is_file .git/info/refs +' + +test_expect_success 'modify and store mtime' ' + test-tool chmtime =0 .git/info/refs && + test-tool chmtime --get .git/info/refs >a +' + +test_expect_success 'info/refs is not needlessly overwritten' ' + git update-server-info && + test-tool chmtime --get .git/info/refs >b && + test_cmp a b +' + +test_expect_success 'info/refs can be forced to update' ' + git update-server-info -f && + test-tool chmtime --get .git/info/refs >b && + ! test_cmp a b +' + +test_expect_success 'info/refs updates when changes are made' ' + test-tool chmtime =0 .git/info/refs && + test-tool chmtime --get .git/info/refs >b && + test_cmp a b && + git update-ref refs/heads/foo HEAD && + git update-server-info && + test-tool chmtime --get .git/info/refs >b && + ! test_cmp a b +' + +test_done -- EW ^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-11 1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong 2019-05-11 7:35 ` Eric Sunshine @ 2019-05-11 21:17 ` Eric Wong 2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason 2019-05-13 23:17 ` [PATCH v3] update-server-info: avoid needless overwrites Eric Wong 3 siblings, 0 replies; 26+ messages in thread From: Eric Wong @ 2019-05-11 21:17 UTC (permalink / raw) To: git Eric Wong <e@80x24.org> wrote: > Combined with stdio and kernel buffering; the kernel should be > able to avoid block layer writes and reduce wear. Along the same lines, I'd like to get repack to stop recreating identical packs at some point (but that could be months from now...) ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-11 1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong 2019-05-11 7:35 ` Eric Sunshine 2019-05-11 21:17 ` [PATCH] " Eric Wong @ 2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason 2019-05-12 0:38 ` Eric Wong 2019-05-12 4:08 ` Jeff King 2019-05-13 23:17 ` [PATCH v3] update-server-info: avoid needless overwrites Eric Wong 3 siblings, 2 replies; 26+ messages in thread From: Ævar Arnfjörð Bjarmason @ 2019-05-11 23:37 UTC (permalink / raw) To: Eric Wong; +Cc: git, Jeff King, Junio C Hamano On Sat, May 11 2019, Eric Wong wrote: > Do not change the existing info/refs and objects/info/packs > files if they match the existing content on the filesystem. > This is intended to preserve mtime and make it easier for dumb > HTTP pollers to rely on the If-Modified-Since header. > > Combined with stdio and kernel buffering; the kernel should be > able to avoid block layer writes and reduce wear. > > Signed-off-by: Eric Wong <e@80x24.org> > --- > server-info.c | 61 +++++++++++++++++++++++++++++------ > t/t5200-update-server-info.sh | 41 +++++++++++++++++++++++ > 2 files changed, 92 insertions(+), 10 deletions(-) > create mode 100755 t/t5200-update-server-info.sh > > diff --git a/server-info.c b/server-info.c > index 41274d098b..34599e4817 100644 > --- a/server-info.c > +++ b/server-info.c > @@ -6,37 +6,78 @@ > #include "tag.h" > #include "packfile.h" > #include "object-store.h" > +#include "strbuf.h" > + > +static int files_differ(FILE *fp, const char *path) > +{ > + struct stat st; > + git_hash_ctx c; > + struct object_id oid_old, oid_new; > + struct strbuf tmp = STRBUF_INIT; > + long new_len = ftell(fp); > + > + if (new_len < 0 || stat(path, &st) < 0) > + return 1; > + if (!S_ISREG(st.st_mode)) > + return 1; > + if ((off_t)new_len != st.st_size) > + return 1; > + > + rewind(fp); > + if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) > + return 1; > + the_hash_algo->init_fn(&c); > + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); > + the_hash_algo->final_fn(oid_new.hash, &c); > + strbuf_release(&tmp); > + > + if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) > + return 1; > + the_hash_algo->init_fn(&c); > + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); > + the_hash_algo->final_fn(oid_old.hash, &c); > + strbuf_release(&tmp); > + > + return hashcmp(oid_old.hash, oid_new.hash); > +} This way of doing it just seems so weirdly convoluted. Read them one at a time, compute the SHA-1, just to see if they're different. Why not something closer to a plain memcmp(): static int files_differ(FILE *fp, const char *path) { struct strbuf old = STRBUF_INIT, new = STRBUF_INIT; long new_len = ftell(fp); int diff = 1; rewind(fp); if (strbuf_fread(&new, (size_t)new_len, fp) != (size_t)new_len) goto release_return; if (strbuf_read_file(&old, path, 0) < 0) goto release_return; diff = strbuf_cmp(&old, &new); release_return: strbuf_release(&old); strbuf_release(&new); return diff; } I.e. optimze for code simplicity with something close to a dumb "cmp --silent". I'm going to make the bold claim that nobody using "dumb http" is going to be impacted by the performance of reading their for-each-ref or for-each-pack dump, hence opting for not even bothing to stat() to get the size before reading. Because really, if we were *trying* to micro-optimize this for time or memory use there's much better ways, e.g. reading the old file and memcmp() as we go and stream the "generate" callback, but I just don't see the point of trying in this case. > /* > * Create the file "path" by writing to a temporary file and renaming > * it into place. The contents of the file come from "generate", which > * should return non-zero if it encounters an error. > */ > -static int update_info_file(char *path, int (*generate)(FILE *)) > +static int update_info_file(char *path, int (*generate)(FILE *), int force) > { > char *tmp = mkpathdup("%s_XXXXXX", path); Unrelated to this, patch, but I hadn't thought about this nasty race condition. We recommend users run this from the "post-update" (or "post-receive") hook, and don't juggle the lock along with the ref update, thus due to the vagaries of scheduling you can end up with two concurrent ref updates where the "old" one wins. But I guess that brings me back to something close to "nobody with that sort of update rate is using 'dumb http'" :) For this to work properly we'd need to take some sort of global "ref update/pack update" lock, and I guess at that point this "cmp" case would be a helper similar to commit_lock_file_to(), i.e. a commit_lock_file_to_if_different(). > int ret = -1; > int fd = -1; > FILE *fp = NULL, *to_close; > + int do_update; > > safe_create_leading_directories(path); > fd = git_mkstemp_mode(tmp, 0666); > if (fd < 0) > goto out; > - to_close = fp = fdopen(fd, "w"); > + to_close = fp = fdopen(fd, "w+"); > if (!fp) > goto out; > fd = -1; > ret = generate(fp); > if (ret) > goto out; > + > + do_update = force || files_differ(fp, path); > [...] > > -static int update_info_refs(void) > +static int update_info_refs(int force) So, I was going to say "shouldn't we update the docs?" which for --force say "Update the info files from scratch.". But reading through it that "from scratch" wording is from c743e6e3c0 ("Add a link from update-server-info documentation to repository layout.", 2005-09-04). There it was a refrence to a bug since fixed in 60d0526aaa ("Unoptimize info/refs creation.", 2005-09-14), and then removed from the docs in c5fe5b6de9 ("Remove obsolete bug warning in man git-update-server-info", 2009-04-25). Then in b3223761c8 ("update_info_refs(): drop unused force parameter", 2019-04-05) Jeff removed the unused-for-a-decade "force" param. So having gone through the history I think we're better off just dropping the --force argument entirely, or at least changing the docs. Before this change the only thing it was doing was pruning stuff we haven't written since 2005-ish (see 3e15c67c90 ("server-info: throw away T computation as well.", 2005-12-04)), rather than "detect if useless" we should just write out the file again, and then skip if changed (i.e. this logic). That would also take care of the parse_pack_def() case by proxy, i.e. we don't need some "is stale if pack is missing" special case if we just always write out a new file (or not if it memcmp's the same as the "old" one), we want to change the file if the packs are updated anyway ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason @ 2019-05-12 0:38 ` Eric Wong 2019-05-12 4:08 ` Jeff King 1 sibling, 0 replies; 26+ messages in thread From: Eric Wong @ 2019-05-12 0:38 UTC (permalink / raw) To: Ævar Arnfjörð Bjarmason; +Cc: git, Jeff King, Junio C Hamano Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote: > On Sat, May 11 2019, Eric Wong wrote: > > +static int files_differ(FILE *fp, const char *path) > > +{ > > + struct stat st; > > + git_hash_ctx c; > > + struct object_id oid_old, oid_new; > > + struct strbuf tmp = STRBUF_INIT; > > + long new_len = ftell(fp); > > + > > + if (new_len < 0 || stat(path, &st) < 0) > > + return 1; > > + if (!S_ISREG(st.st_mode)) > > + return 1; > > + if ((off_t)new_len != st.st_size) > > + return 1; > > + > > + rewind(fp); > > + if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) > > + return 1; > > + the_hash_algo->init_fn(&c); > > + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); > > + the_hash_algo->final_fn(oid_new.hash, &c); > > + strbuf_release(&tmp); > > + > > + if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) > > + return 1; > > + the_hash_algo->init_fn(&c); > > + the_hash_algo->update_fn(&c, tmp.buf, tmp.len); > > + the_hash_algo->final_fn(oid_old.hash, &c); > > + strbuf_release(&tmp); > > + > > + return hashcmp(oid_old.hash, oid_new.hash); > > +} > > This way of doing it just seems so weirdly convoluted. Read them one at > a time, compute the SHA-1, just to see if they're different. Why not > something closer to a plain memcmp(): > > static int files_differ(FILE *fp, const char *path) > { > struct strbuf old = STRBUF_INIT, new = STRBUF_INIT; > long new_len = ftell(fp); > int diff = 1; > > rewind(fp); > if (strbuf_fread(&new, (size_t)new_len, fp) != (size_t)new_len) > goto release_return; > if (strbuf_read_file(&old, path, 0) < 0) > goto release_return; > > diff = strbuf_cmp(&old, &new); > > release_return: > strbuf_release(&old); > strbuf_release(&new); > > return diff; > } > > I.e. optimze for code simplicity with something close to a dumb "cmp > --silent". I'm going to make the bold claim that nobody using "dumb > http" is going to be impacted by the performance of reading their > for-each-ref or for-each-pack dump, hence opting for not even bothing to > stat() to get the size before reading. I've been trying to improve dumb HTTP for more cases; actually. (since it's much cheaper than smart HTTP in server memory/CPU) > Because really, if we were *trying* to micro-optimize this for time or > memory use there's much better ways, e.g. reading the old file and > memcmp() as we go and stream the "generate" callback, but I just don't > see the point of trying in this case. I was actually going towards that route; but wasn't sure if this idea would be accepted at all (and I've been trying to stay away from using non-scripting languages). I don't slurping all of info/refs into memory at all; so maybe a streaming memcmp of the existing file is worth doing... > > /* > > * Create the file "path" by writing to a temporary file and renaming > > * it into place. The contents of the file come from "generate", which > > * should return non-zero if it encounters an error. > > */ > > -static int update_info_file(char *path, int (*generate)(FILE *)) > > +static int update_info_file(char *path, int (*generate)(FILE *), int force) > > { > > char *tmp = mkpathdup("%s_XXXXXX", path); > > Unrelated to this, patch, but I hadn't thought about this nasty race > condition. We recommend users run this from the "post-update" (or > "post-receive") hook, and don't juggle the lock along with the ref > update, thus due to the vagaries of scheduling you can end up with two > concurrent ref updates where the "old" one wins. > > But I guess that brings me back to something close to "nobody with that > sort of update rate is using 'dumb http'" :) > > For this to work properly we'd need to take some sort of global "ref > update/pack update" lock, and I guess at that point this "cmp" case > would be a helper similar to commit_lock_file_to(), > i.e. a commit_lock_file_to_if_different(). Worth a separate patch, at some point, I think. I'm not too familiar with the existing locking in git, actually... Along those lines, I think repack/gc should automatically update objects/info/packs if the file already exists. > > int ret = -1; > > int fd = -1; > > FILE *fp = NULL, *to_close; > > + int do_update; > > > > safe_create_leading_directories(path); > > fd = git_mkstemp_mode(tmp, 0666); > > if (fd < 0) > > goto out; > > - to_close = fp = fdopen(fd, "w"); > > + to_close = fp = fdopen(fd, "w+"); > > if (!fp) > > goto out; > > fd = -1; > > ret = generate(fp); > > if (ret) > > goto out; > > + > > + do_update = force || files_differ(fp, path); > > [...] > > > > -static int update_info_refs(void) > > +static int update_info_refs(int force) > > So, I was going to say "shouldn't we update the docs?" which for --force > say "Update the info files from scratch.". > > But reading through it that "from scratch" wording is from c743e6e3c0 > ("Add a link from update-server-info documentation to repository > layout.", 2005-09-04). Yes, that wording is awkward and I can update it. But maybe making it undocumented is sufficient and would save us the trouble of describing it :) "--force" might be seen as a performance optimization for cases where you're certain the result will differ, but I'm not sure if that's worth mentioning in the manpage. > There it was a refrence to a bug since fixed in 60d0526aaa ("Unoptimize > info/refs creation.", 2005-09-14), and then removed from the docs in > c5fe5b6de9 ("Remove obsolete bug warning in man git-update-server-info", > 2009-04-25). > > Then in b3223761c8 ("update_info_refs(): drop unused force parameter", > 2019-04-05) Jeff removed the unused-for-a-decade "force" param. > > So having gone through the history I think we're better off just > dropping the --force argument entirely, or at least changing the > docs. I can update the docs, or make it undocumented. Compatibility from the command-line needs to remain in case there are scripts using it. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason 2019-05-12 0:38 ` Eric Wong @ 2019-05-12 4:08 ` Jeff King 2019-05-12 7:16 ` Ævar Arnfjörð Bjarmason 1 sibling, 1 reply; 26+ messages in thread From: Jeff King @ 2019-05-12 4:08 UTC (permalink / raw) To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano On Sun, May 12, 2019 at 01:37:55AM +0200, Ævar Arnfjörð Bjarmason wrote: > This way of doing it just seems so weirdly convoluted. Read them one at > a time, compute the SHA-1, just to see if they're different. Why not > something closer to a plain memcmp(): FWIW, I had the exact same thought on reading the patch. Checking sizes seems like an easy optimization and I don't mind it, but computing hashes when we could just compare the bytes seems pointless. I'd have expected to stream 4k blocks as we go. > I.e. optimze for code simplicity with something close to a dumb "cmp > --silent". I'm going to make the bold claim that nobody using "dumb > http" is going to be impacted by the performance of reading their > for-each-ref or for-each-pack dump, hence opting for not even bothing to > stat() to get the size before reading. You're probably right (especially because we'd just spent O(n) work generating the candidate file). But note that I have seen pathological cases where info/refs was gigabytes. > > /* > > * Create the file "path" by writing to a temporary file and renaming > > * it into place. The contents of the file come from "generate", which > > * should return non-zero if it encounters an error. > > */ > > -static int update_info_file(char *path, int (*generate)(FILE *)) > > +static int update_info_file(char *path, int (*generate)(FILE *), int force) > > { > > char *tmp = mkpathdup("%s_XXXXXX", path); > > Unrelated to this, patch, but I hadn't thought about this nasty race > condition. We recommend users run this from the "post-update" (or > "post-receive") hook, and don't juggle the lock along with the ref > update, thus due to the vagaries of scheduling you can end up with two > concurrent ref updates where the "old" one wins. > > But I guess that brings me back to something close to "nobody with that > sort of update rate is using 'dumb http'" :) > > For this to work properly we'd need to take some sort of global "ref > update/pack update" lock, and I guess at that point this "cmp" case > would be a helper similar to commit_lock_file_to(), > i.e. a commit_lock_file_to_if_different(). I don't think our usual dot-locking is great here. What does the race-loser do when it can't take the lock? It can't just skip its update, since it needs to make sure that its new pack is mentioned. So it has to wait and poll until the other process finishes. I guess maybe that isn't the end of the world. > Then in b3223761c8 ("update_info_refs(): drop unused force parameter", > 2019-04-05) Jeff removed the unused-for-a-decade "force" param. > > So having gone through the history I think we're better off just > dropping the --force argument entirely, or at least changing the > docs. > > Before this change the only thing it was doing was pruning stuff we > haven't written since 2005-ish (see 3e15c67c90 ("server-info: throw away > T computation as well.", 2005-12-04)), rather than "detect if useless" > we should just write out the file again, and then skip if changed > (i.e. this logic). Yeah, my commit only ripped out the useless force parameter for info/refs. For info/packs, there's still that weird "is is stale" computation (which I fixed several bugs in). It's not entirely clear to me if that can just go away, but I agree that if we can it's simpler and more desirable to just generate the candidate result and see if it's bit-for-bit identical or not. I'm not entirely sure of all of the magic that "stale" check is trying to accomplish. I think there's some bits in there that try to preserve the existing ordering, but I don't know why anyone would care (and there are so many cases where the ordering gets thrown out that I think anybody who does care is likely to get disappointed). -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-12 4:08 ` Jeff King @ 2019-05-12 7:16 ` Ævar Arnfjörð Bjarmason 2019-05-14 9:47 ` Jeff King 0 siblings, 1 reply; 26+ messages in thread From: Ævar Arnfjörð Bjarmason @ 2019-05-12 7:16 UTC (permalink / raw) To: Jeff King; +Cc: Eric Wong, git, Junio C Hamano On Sun, May 12 2019, Jeff King wrote: > On Sun, May 12, 2019 at 01:37:55AM +0200, Ævar Arnfjörð Bjarmason wrote: > >> This way of doing it just seems so weirdly convoluted. Read them one at >> a time, compute the SHA-1, just to see if they're different. Why not >> something closer to a plain memcmp(): > > FWIW, I had the exact same thought on reading the patch. Checking sizes > seems like an easy optimization and I don't mind it, but computing > hashes when we could just compare the bytes seems pointless. I'd have > expected to stream 4k blocks as we go. > >> I.e. optimze for code simplicity with something close to a dumb "cmp >> --silent". I'm going to make the bold claim that nobody using "dumb >> http" is going to be impacted by the performance of reading their >> for-each-ref or for-each-pack dump, hence opting for not even bothing to >> stat() to get the size before reading. > > You're probably right (especially because we'd just spent O(n) work > generating the candidate file). But note that I have seen pathological > cases where info/refs was gigabytes. Wouldn't those users be calling update-server-info from something like a post-receive hook where they *know* the refs/packs just got updated? Well, there is "transfer.unpackLimit" to contend with, but that's just for "packs are same", not "refs are same", and that file's presumably much smaller. So I'd think that *if* this is a case where there's even a question of the performance mattering here, and I'd assume "refs changed but size is the same" is a common case (e.g. craploads or tags, but just "master" pushed-to) we could make this an opt-in "--update-if-changed", because the "if changed" is extra work, and we're already documenting that you should be calling this on the "I know it changed" hook path. I don't care even if we use the initial SHA-1 method. It just struck me as odd to conflate the reasonable "don't update mtime for optimizing dumb over-the-wire client" with "optimize resource use when updating". >> > /* >> > * Create the file "path" by writing to a temporary file and renaming >> > * it into place. The contents of the file come from "generate", which >> > * should return non-zero if it encounters an error. >> > */ >> > -static int update_info_file(char *path, int (*generate)(FILE *)) >> > +static int update_info_file(char *path, int (*generate)(FILE *), int force) >> > { >> > char *tmp = mkpathdup("%s_XXXXXX", path); >> >> Unrelated to this, patch, but I hadn't thought about this nasty race >> condition. We recommend users run this from the "post-update" (or >> "post-receive") hook, and don't juggle the lock along with the ref >> update, thus due to the vagaries of scheduling you can end up with two >> concurrent ref updates where the "old" one wins. >> >> But I guess that brings me back to something close to "nobody with that >> sort of update rate is using 'dumb http'" :) >> >> For this to work properly we'd need to take some sort of global "ref >> update/pack update" lock, and I guess at that point this "cmp" case >> would be a helper similar to commit_lock_file_to(), >> i.e. a commit_lock_file_to_if_different(). > > I don't think our usual dot-locking is great here. What does the > race-loser do when it can't take the lock? It can't just skip its > update, since it needs to make sure that its new pack is mentioned. Right, I mean a *global* .git/I_AM_DOING_WRITES.lock, because there's no way to square the ref backend's notion of per-ref ref lock enabling concurrent pushes with update-server-info's desire to generate metadata showing *all* the refs. > So it has to wait and poll until the other process finishes. I guess > maybe that isn't the end of the world. If "its" is update-server-info this won't work. It's possible for two update-server-info processes to be racing in such a way that their for_each_ref() reads a ref at a given value that'll be updated 1 millisecond later, but to then have that update's update-server-info "win" the race to update the info files (hypothetical locks on those info files and all). Thus the "info" files will be updated to old values, since we'll see we need changes, but change things to the old values. All things that *can* be dealt with in some clever ways, but I think just further proof nobody's using this for anything serious :) I.e. the "commit A happened before B" but also "commit B's post-* hooks finished after A" is a thing that happens quite frequently (per my logs). >> Then in b3223761c8 ("update_info_refs(): drop unused force parameter", >> 2019-04-05) Jeff removed the unused-for-a-decade "force" param. >> >> So having gone through the history I think we're better off just >> dropping the --force argument entirely, or at least changing the >> docs. >> >> Before this change the only thing it was doing was pruning stuff we >> haven't written since 2005-ish (see 3e15c67c90 ("server-info: throw away >> T computation as well.", 2005-12-04)), rather than "detect if useless" >> we should just write out the file again, and then skip if changed >> (i.e. this logic). > > Yeah, my commit only ripped out the useless force parameter for > info/refs. For info/packs, there's still that weird "is is stale" > computation (which I fixed several bugs in). It's not entirely clear to > me if that can just go away, but I agree that if we can it's simpler and > more desirable to just generate the candidate result and see if it's > bit-for-bit identical or not. > > I'm not entirely sure of all of the magic that "stale" check is trying > to accomplish. I think there's some bits in there that try to preserve > the existing ordering, but I don't know why anyone would care (and there > are so many cases where the ordering gets thrown out that I think > anybody who does care is likely to get disappointed). My reading of it is that it's premature optimization that can go away (and most of it has already), for "it's cheap" and "if not it's called from the 'I know I had an update'" hook case, as noted above.< ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-12 7:16 ` Ævar Arnfjörð Bjarmason @ 2019-05-14 9:47 ` Jeff King 2019-05-14 10:33 ` Ævar Arnfjörð Bjarmason 2019-05-14 11:50 ` Eric Wong 0 siblings, 2 replies; 26+ messages in thread From: Jeff King @ 2019-05-14 9:47 UTC (permalink / raw) To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano On Sun, May 12, 2019 at 09:16:31AM +0200, Ævar Arnfjörð Bjarmason wrote: > > You're probably right (especially because we'd just spent O(n) work > > generating the candidate file). But note that I have seen pathological > > cases where info/refs was gigabytes. > > Wouldn't those users be calling update-server-info from something like a > post-receive hook where they *know* the refs/packs just got updated? > > Well, there is "transfer.unpackLimit" to contend with, but that's just > for "packs are same", not "refs are same", and that file's presumably > much smaller. Yeah, I think there's sort of an open question here of who is calling update-server-info when nothing got updated. I think the only place we call it automatically is via receive-pack, but I'd guess Eric runs it as part of public-inbox scripts. I agree that this is probably mostly about info/packs. Not every push (or repo update) will create one, but every push _should_ be changing a ref (if it succeeds at all). And I'd guess Eric's interest comes from the use of Git in public-inbox, which is going to make frequent but small updates. So this does seem like a really niche case; it avoids one single HTTP request of a small that should generally be small (unless you're letting your pack list grow really big, but I think there are other issues with that) in a case that we know will generate a bunch of other HTTP traffic (if somebody updated the server info, there was indeed a push, so you'd get a refreshed info/refs and presumably the new loose objects). That said, the logic to preserve the mtime is pretty straightforward, so I don't mind it too much if Eric finds this really improves things for him. > > I don't think our usual dot-locking is great here. What does the > > race-loser do when it can't take the lock? It can't just skip its > > update, since it needs to make sure that its new pack is mentioned. > > Right, I mean a *global* .git/I_AM_DOING_WRITES.lock, because there's no > way to square the ref backend's notion of per-ref ref lock enabling > concurrent pushes with update-server-info's desire to generate metadata > showing *all* the refs. OK. I agree that would work, but it's nasty to delay user-initiated operations for ongoing maintenance (another obvious place to want such a lock is for pruning, which can take quite a while). > > So it has to wait and poll until the other process finishes. I guess > > maybe that isn't the end of the world. > > If "its" is update-server-info this won't work. It's possible for two > update-server-info processes to be racing in such a way that their > for_each_ref() reads a ref at a given value that'll be updated 1 > millisecond later, but to then have that update's update-server-info > "win" the race to update the info files (hypothetical locks on those > info files and all). > > Thus the "info" files will be updated to old values, since we'll see we > need changes, but change things to the old values. > > All things that *can* be dealt with in some clever ways, but I think > just further proof nobody's using this for anything serious :) > > I.e. the "commit A happened before B" but also "commit B's post-* hooks > finished after A" is a thing that happens quite frequently (per my > logs). I think it would work because any update-server-info, whether from A or B, will take into account the full current repo state (and we don't look at that state until we take the lock). So you might get an interleaved "A-push, B-push, B-maint, A-maint", but that's OK. A-maint will represent B's state when it runs. > > I'm not entirely sure of all of the magic that "stale" check is trying > > to accomplish. I think there's some bits in there that try to preserve > > the existing ordering, but I don't know why anyone would care (and there > > are so many cases where the ordering gets thrown out that I think > > anybody who does care is likely to get disappointed). > > My reading of it is that it's premature optimization that can go away > (and most of it has already), for "it's cheap" and "if not it's called > from the 'I know I had an update'" hook case, as noted above.< That's my reading, too, but I didn't want to be responsible for regressing some obscure case. At least Eric seems to _use_ update-server-info. ;) -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-14 9:47 ` Jeff King @ 2019-05-14 10:33 ` Ævar Arnfjörð Bjarmason 2019-05-14 11:24 ` Jeff King 2019-05-14 11:50 ` Eric Wong 1 sibling, 1 reply; 26+ messages in thread From: Ævar Arnfjörð Bjarmason @ 2019-05-14 10:33 UTC (permalink / raw) To: Jeff King; +Cc: Eric Wong, git, Junio C Hamano On Tue, May 14 2019, Jeff King wrote: > On Sun, May 12, 2019 at 09:16:31AM +0200, Ævar Arnfjörð Bjarmason wrote: > >> > You're probably right (especially because we'd just spent O(n) work >> > generating the candidate file). But note that I have seen pathological >> > cases where info/refs was gigabytes. >> >> Wouldn't those users be calling update-server-info from something like a >> post-receive hook where they *know* the refs/packs just got updated? >> >> Well, there is "transfer.unpackLimit" to contend with, but that's just >> for "packs are same", not "refs are same", and that file's presumably >> much smaller. > > Yeah, I think there's sort of an open question here of who is calling > update-server-info when nothing got updated. I think the only place we > call it automatically is via receive-pack, but I'd guess Eric runs it as > part of public-inbox scripts. > > I agree that this is probably mostly about info/packs. Not every push > (or repo update) will create one, but every push _should_ be changing a > ref (if it succeeds at all). And I'd guess Eric's interest comes from > the use of Git in public-inbox, which is going to make frequent but > small updates. > > So this does seem like a really niche case; it avoids one single HTTP > request of a small that should generally be small (unless you're letting > your pack list grow really big, but I think there are other issues with > that) in a case that we know will generate a bunch of other HTTP traffic > (if somebody updated the server info, there was indeed a push, so you'd > get a refreshed info/refs and presumably the new loose objects). > > That said, the logic to preserve the mtime is pretty straightforward, so > I don't mind it too much if Eric finds this really improves things for > him. > >> > I don't think our usual dot-locking is great here. What does the >> > race-loser do when it can't take the lock? It can't just skip its >> > update, since it needs to make sure that its new pack is mentioned. >> >> Right, I mean a *global* .git/I_AM_DOING_WRITES.lock, because there's no >> way to square the ref backend's notion of per-ref ref lock enabling >> concurrent pushes with update-server-info's desire to generate metadata >> showing *all* the refs. > > OK. I agree that would work, but it's nasty to delay user-initiated > operations for ongoing maintenance (another obvious place to want such a > lock is for pruning, which can take quite a while). > >> > So it has to wait and poll until the other process finishes. I guess >> > maybe that isn't the end of the world. >> >> If "its" is update-server-info this won't work. It's possible for two >> update-server-info processes to be racing in such a way that their >> for_each_ref() reads a ref at a given value that'll be updated 1 >> millisecond later, but to then have that update's update-server-info >> "win" the race to update the info files (hypothetical locks on those >> info files and all). >> >> Thus the "info" files will be updated to old values, since we'll see we >> need changes, but change things to the old values. >> >> All things that *can* be dealt with in some clever ways, but I think >> just further proof nobody's using this for anything serious :) >> >> I.e. the "commit A happened before B" but also "commit B's post-* hooks >> finished after A" is a thing that happens quite frequently (per my >> logs). > > I think it would work because any update-server-info, whether from A or > B, will take into account the full current repo state (and we don't look > at that state until we take the lock). So you might get an interleaved > "A-push, B-push, B-maint, A-maint", but that's OK. A-maint will > represent B's state when it runs. Maybe we're talking about different things. I mean the following sequence: 1. Refs "X" and "Y" are at X=A Y=A 2. Concurrent push #1 happens, updating X from A..F 3. Concurrent push #2 happens, updating Y from A..F 4. Concurrent push #1 succeeds 5. Concurrent push #1 starts update-server-info. Reads X=F Y=A 5. Concurrent push #2 succeeds 6. Concurrent push #2 starts update-server-info. Reads X=F Y=F 7. Concurrent push #2's update-server-info finishes, X=F Y=F written to "info" 8. Concurrent push #1's update-server-info finishes, X=A Y=F written to "info" I.e. because we have per-ref locks and no lock at all on update-server-info (but that would need to be a global ref lock, not just on the "info" files) we can have a push that's already read "X"'s value as "A" while updating "Y" win the race against an update-server-info that updated "X"'s value to "F". It will get fixed on the next push (at least as far as "X"'s value goes), but until that time dumb clients will falsely see that "X" hasn't been updated. >> > I'm not entirely sure of all of the magic that "stale" check is trying >> > to accomplish. I think there's some bits in there that try to preserve >> > the existing ordering, but I don't know why anyone would care (and there >> > are so many cases where the ordering gets thrown out that I think >> > anybody who does care is likely to get disappointed). >> >> My reading of it is that it's premature optimization that can go away >> (and most of it has already), for "it's cheap" and "if not it's called >> from the 'I know I had an update'" hook case, as noted above.< > > That's my reading, too, but I didn't want to be responsible for > regressing some obscure case. At least Eric seems to _use_ > update-server-info. ;) Yeah I agree with that sentiment. I don't use it. I was just wondering if for those who still want it this wasn't a relatively obscure use-case, so it should perhaps be hidden behind an option if there were optimization concerns. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-14 10:33 ` Ævar Arnfjörð Bjarmason @ 2019-05-14 11:24 ` Jeff King 2019-05-14 11:57 ` Ævar Arnfjörð Bjarmason 0 siblings, 1 reply; 26+ messages in thread From: Jeff King @ 2019-05-14 11:24 UTC (permalink / raw) To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano On Tue, May 14, 2019 at 12:33:11PM +0200, Ævar Arnfjörð Bjarmason wrote: > > I think it would work because any update-server-info, whether from A or > > B, will take into account the full current repo state (and we don't look > > at that state until we take the lock). So you might get an interleaved > > "A-push, B-push, B-maint, A-maint", but that's OK. A-maint will > > represent B's state when it runs. > > Maybe we're talking about different things. I mean the following > sequence: > > 1. Refs "X" and "Y" are at X=A Y=A > 2. Concurrent push #1 happens, updating X from A..F > 3. Concurrent push #2 happens, updating Y from A..F > 4. Concurrent push #1 succeeds > 5. Concurrent push #1 starts update-server-info. Reads X=F Y=A > 5. Concurrent push #2 succeeds > 6. Concurrent push #2 starts update-server-info. Reads X=F Y=F > 7. Concurrent push #2's update-server-info finishes, X=F Y=F written to "info" > 8. Concurrent push #1's update-server-info finishes, X=A Y=F written to "info" > > I.e. because we have per-ref locks and no lock at all on > update-server-info (but that would need to be a global ref lock, not > just on the "info" files) we can have a push that's already read "X"'s > value as "A" while updating "Y" win the race against an > update-server-info that updated "X"'s value to "F". > > It will get fixed on the next push (at least as far as "X"'s value > goes), but until that time dumb clients will falsely see that "X" hasn't > been updated. That's the same situation. But I thought we were talking about having an update-server-info lock. In which case the #2 update-server-info or the #1 update-server-info runs in its entirety, and cannot have their read and write steps interleaved (that's what I meant by "don't look at the state until we take the lock"). Then that gives us a strict ordering: we know that _some_ update-server-info (be it #1 or #2's) will run after any given update. -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-14 11:24 ` Jeff King @ 2019-05-14 11:57 ` Ævar Arnfjörð Bjarmason 0 siblings, 0 replies; 26+ messages in thread From: Ævar Arnfjörð Bjarmason @ 2019-05-14 11:57 UTC (permalink / raw) To: Jeff King; +Cc: Eric Wong, git, Junio C Hamano On Tue, May 14 2019, Jeff King wrote: > On Tue, May 14, 2019 at 12:33:11PM +0200, Ævar Arnfjörð Bjarmason wrote: > >> > I think it would work because any update-server-info, whether from A or >> > B, will take into account the full current repo state (and we don't look >> > at that state until we take the lock). So you might get an interleaved >> > "A-push, B-push, B-maint, A-maint", but that's OK. A-maint will >> > represent B's state when it runs. >> >> Maybe we're talking about different things. I mean the following >> sequence: >> >> 1. Refs "X" and "Y" are at X=A Y=A >> 2. Concurrent push #1 happens, updating X from A..F >> 3. Concurrent push #2 happens, updating Y from A..F >> 4. Concurrent push #1 succeeds >> 5. Concurrent push #1 starts update-server-info. Reads X=F Y=A >> 5. Concurrent push #2 succeeds >> 6. Concurrent push #2 starts update-server-info. Reads X=F Y=F >> 7. Concurrent push #2's update-server-info finishes, X=F Y=F written to "info" >> 8. Concurrent push #1's update-server-info finishes, X=A Y=F written to "info" >> >> I.e. because we have per-ref locks and no lock at all on >> update-server-info (but that would need to be a global ref lock, not >> just on the "info" files) we can have a push that's already read "X"'s >> value as "A" while updating "Y" win the race against an >> update-server-info that updated "X"'s value to "F". >> >> It will get fixed on the next push (at least as far as "X"'s value >> goes), but until that time dumb clients will falsely see that "X" hasn't >> been updated. > > That's the same situation. But I thought we were talking about having an > update-server-info lock. In which case the #2 update-server-info or the > #1 update-server-info runs in its entirety, and cannot have their read > and write steps interleaved (that's what I meant by "don't look at the > state until we take the lock"). Then that gives us a strict ordering: we > know that _some_ update-server-info (be it #1 or #2's) will run after > any given update. Yeah you're right. I *thought* in my last E-mail we were talking about the current state, but re-reading upthread I see that was a fail on my part. An update-server-info lock would solve this indeed. We could still end up with a situation where whatever a naïve version of the lockfile API would fail for the "new" update since the old one was underway, so we'd need something similar to core.*Ref*Timeout, but if we ran into a *.lock or the timeout we could exit non-zero, as opposed to silently failing like it does now when it races. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-14 9:47 ` Jeff King 2019-05-14 10:33 ` Ævar Arnfjörð Bjarmason @ 2019-05-14 11:50 ` Eric Wong 2019-05-14 12:13 ` dumb HTTP things I want to do Eric Wong 2019-05-14 12:19 ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason 1 sibling, 2 replies; 26+ messages in thread From: Eric Wong @ 2019-05-14 11:50 UTC (permalink / raw) To: Jeff King; +Cc: Ævar Arnfjörð Bjarmason, git, Junio C Hamano Jeff King <peff@peff.net> wrote: > Yeah, I think there's sort of an open question here of who is calling > update-server-info when nothing got updated. I think the only place we > call it automatically is via receive-pack, but I'd guess Eric runs it as > part of public-inbox scripts. Correct. post-update doesn't get run because public-inbox relies on fast-import. I have mirrors using "git fetch", which also doesn't call post-update, either so I was calling update-server-info in my mirror script. Since more people have taken an interest in mirroring things, I figured I'd make "public-inbox-index" (the script which updates the Xapian and SQLite indices) call update-server-info itself. That way, it's simpler to mirror (v1) inboxes: git fetch && git update-server-info && public-inbox-index becomes: git fetch && public-inbox-index That's a huge savings in cognitive overhead. So, my eventual goal for this is we get to the point where any git operation which changes refs will automatically update info/refs if it exists. Ditto for objects/info/packs on any pack changes. This, like my bitmap-by-default change is among the things I'm trying to make it easier for more users to start self-hosting (including mirroring) any type of git repo. Anyways, I am far from knowledgeable about the locking discussion for git, though :x > That's my reading, too, but I didn't want to be responsible for > regressing some obscure case. At least Eric seems to _use_ > update-server-info. ;) I also have something else on my mind for abusing info files with :> (another email) ^ permalink raw reply [flat|nested] 26+ messages in thread
* dumb HTTP things I want to do 2019-05-14 11:50 ` Eric Wong @ 2019-05-14 12:13 ` Eric Wong 2019-05-14 12:27 ` Jeff King 2019-05-14 12:19 ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason 1 sibling, 1 reply; 26+ messages in thread From: Eric Wong @ 2019-05-14 12:13 UTC (permalink / raw) To: Jeff King; +Cc: Ævar Arnfjörð Bjarmason, git, Junio C Hamano Eric Wong <e@80x24.org> wrote: > Jeff King <peff@peff.net> wrote: > > That's my reading, too, but I didn't want to be responsible for > > regressing some obscure case. At least Eric seems to _use_ > > update-server-info. ;) > > I also have something else on my mind for abusing info files with :> > (another email) I'm not sure when/if I'll have time for this; but this ought to be possible: GIT_DIR=$HTTP_URL git <any read-only command> And possible without existing admins to setup or change anything on their server. Right now, I could do it by setting up a WebDAV server and using fusedav[1] on the client. But, not everybody runs a WebDAV server which allows PROPFIND for listing files... However, info/refs and objects/info/packs can give us all the info we need without needing PROPFIND. All we'd need is the common GET/HEAD HTTP methods for read-only access. git doesn't need mmap; and curl + Range requests ought to be able to get us what we need to emulate pread. It'd be great for low-latency LANs, maybe not so great with high latency; but probably better in many cases than cloning a giant repo to cat one blob. Also, cloning on a static bundle ought to be doable with: git clone $REMOTE_OR_LOCAL_PATH/foo.bundle And yeah, it also sucks that bundles double storage overhead for admins; it would be nice if I could use bundles as alternates or packs... Anyways, all of this is probably a lot of work and I don't hack much, anymore. [1] I have many patches for fusedav, and the debian maintainer seems dead, and upstream's moved on: https://bugs.debian.org/fusedav davfs2 can't do Range requests, so it won't work for big repos... ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: dumb HTTP things I want to do 2019-05-14 12:13 ` dumb HTTP things I want to do Eric Wong @ 2019-05-14 12:27 ` Jeff King 0 siblings, 0 replies; 26+ messages in thread From: Jeff King @ 2019-05-14 12:27 UTC (permalink / raw) To: Eric Wong; +Cc: Ævar Arnfjörð Bjarmason, git, Junio C Hamano On Tue, May 14, 2019 at 12:13:50PM +0000, Eric Wong wrote: > I'm not sure when/if I'll have time for this; but this ought to > be possible: > > GIT_DIR=$HTTP_URL git <any read-only command> > > And possible without existing admins to setup or change > anything on their server. > [...] > git doesn't need mmap; and curl + Range requests ought to be > able to get us what we need to emulate pread. It'd be great for > low-latency LANs, maybe not so great with high latency; but > probably better in many cases than cloning a giant repo to cat > one blob. My first thought here is that we could probably just use the filesystem boundary as the appropriate layer, and have an httpfs fuse driver. Your mention of fusedav makes me think you've already gone this route. I guess non-dav http doesn't necessarily let us enumerate directories. But it might be enough if we could insert ourselves in a few key spots. E.g., when we try to opendir("objects/packs") and it fails with ENOSYS or similar, then we fallback to opening "objects/info/packs" to get the same information (and ditto for loose ref traversal). There'd still be _some_ work in Git, but it wouldn't require replacing every single read with HTTP magic. > Also, cloning on a static bundle ought to be doable with: > > git clone $REMOTE_OR_LOCAL_PATH/foo.bundle Some nearby discussion: https://public-inbox.org/git/20190514092900.GA11679@sigill.intra.peff.net/ > And yeah, it also sucks that bundles double storage overhead > for admins; it would be nice if I could use bundles as alternates > or packs... Yes. It's nice that they're a single file for some uses, but in terms of implementation it would be easier if the bundle files just said "here are some refs, and you can find my packfile in the relative filename XYZ.pack". And then you'd just store the two of them together (and a matching .idx if you wanted to use it as a real pack, but the bundle readers wouldn't care). It probably wouldn't be that hard to wedge that into the bundle format by bumping the version field. -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-14 11:50 ` Eric Wong 2019-05-14 12:13 ` dumb HTTP things I want to do Eric Wong @ 2019-05-14 12:19 ` Ævar Arnfjörð Bjarmason 2019-05-14 12:29 ` Jeff King 2019-05-15 0:45 ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong 1 sibling, 2 replies; 26+ messages in thread From: Ævar Arnfjörð Bjarmason @ 2019-05-14 12:19 UTC (permalink / raw) To: Eric Wong; +Cc: Jeff King, git, Junio C Hamano On Tue, May 14 2019, Eric Wong wrote: > Jeff King <peff@peff.net> wrote: >> Yeah, I think there's sort of an open question here of who is calling >> update-server-info when nothing got updated. I think the only place we >> call it automatically is via receive-pack, but I'd guess Eric runs it as >> part of public-inbox scripts. > > Correct. post-update doesn't get run because public-inbox > relies on fast-import. I have mirrors using "git fetch", which > also doesn't call post-update, either so I was calling > update-server-info in my mirror script. > > Since more people have taken an interest in mirroring things, > I figured I'd make "public-inbox-index" (the script which > updates the Xapian and SQLite indices) call update-server-info > itself. > > That way, it's simpler to mirror (v1) inboxes: > > git fetch && git update-server-info && public-inbox-index > > becomes: > > git fetch && public-inbox-index > > That's a huge savings in cognitive overhead. > > So, my eventual goal for this is we get to the point where any > git operation which changes refs will automatically update > info/refs if it exists. > > Ditto for objects/info/packs on any pack changes. > > This, like my bitmap-by-default change is among the things > I'm trying to make it easier for more users to start > self-hosting (including mirroring) any type of git repo. > > Anyways, I am far from knowledgeable about the locking > discussion for git, though :x > >> That's my reading, too, but I didn't want to be responsible for >> regressing some obscure case. At least Eric seems to _use_ >> update-server-info. ;) > > I also have something else on my mind for abusing info files with :> > (another email) Aside from this change, I wonder if making "fetch" optionally "exit 1" if no refs were updated would be useful, as in the below WIP. Of course it would be better to distinguish errors from "no refs to update". I've hacked around it not doing that in the past for similar "fetch and index" things by looking at for-each-ref before/after, or only seeing if HEAD changed (so this wouldn't be a general solution...): diff --git a/builtin/fetch.c b/builtin/fetch.c index 4ba63d5ac6..da5414d9db 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -47,6 +47,7 @@ static int prune_tags = -1; /* unspecified */ #define PRUNE_TAGS_BY_DEFAULT 0 /* do we prune tags by default? */ static int all, append, dry_run, force, keep, multiple, update_head_ok, verbosity, deepen_relative; +static int exit_code; static int progress = -1; static int tags = TAGS_DEFAULT, unshallow, update_shallow, deepen; static int max_children = 1; @@ -66,6 +67,7 @@ static struct refspec refmap = REFSPEC_INIT_FETCH; static struct list_objects_filter_options filter_options; static struct string_list server_options = STRING_LIST_INIT_DUP; static struct string_list negotiation_tip = STRING_LIST_INIT_NODUP; +static int updated_refs; static int git_fetch_config(const char *k, const char *v, void *cb) { @@ -133,6 +135,8 @@ static struct option builtin_fetch_options[] = { { OPTION_CALLBACK, 0, "recurse-submodules", &recurse_submodules, N_("on-demand"), N_("control recursive fetching of submodules"), PARSE_OPT_OPTARG, option_fetch_parse_recurse_submodules }, + OPT_BOOL(0, "exit-code", &exit_code, + N_("exit successfully if refs are updated")), OPT_BOOL(0, "dry-run", &dry_run, N_("dry run")), OPT_BOOL('k', "keep", &keep, N_("keep downloaded pack")), @@ -522,8 +526,10 @@ static int s_update_ref(const char *action, struct strbuf err = STRBUF_INIT; int ret, df_conflict = 0; - if (dry_run) + if (dry_run) { + updated_refs++; return 0; + } if (!rla) rla = default_rla.buf; msg = xstrfmt("%s: %s", rla, action); @@ -545,6 +551,7 @@ static int s_update_ref(const char *action, ref_transaction_free(transaction); strbuf_release(&err); free(msg); + updated_refs++; return 0; fail: ref_transaction_free(transaction); @@ -1680,5 +1687,9 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) run_command_v_opt(argv_gc_auto.argv, RUN_GIT_CMD); argv_array_clear(&argv_gc_auto); - return result; + if (result) + return result; + if (exit_code) + return !updated_refs; + return 0; } ^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH] update-server-info: avoid needless overwrites 2019-05-14 12:19 ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason @ 2019-05-14 12:29 ` Jeff King 2019-05-15 0:45 ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong 1 sibling, 0 replies; 26+ messages in thread From: Jeff King @ 2019-05-14 12:29 UTC (permalink / raw) To: Ævar Arnfjörð Bjarmason; +Cc: Eric Wong, git, Junio C Hamano On Tue, May 14, 2019 at 02:19:36PM +0200, Ævar Arnfjörð Bjarmason wrote: > Aside from this change, I wonder if making "fetch" optionally "exit 1" > if no refs were updated would be useful, as in the below WIP. Of course > it would be better to distinguish errors from "no refs to update". That seems very sensible to me, as long as it's an optional flag as you have it here. > @@ -133,6 +135,8 @@ static struct option builtin_fetch_options[] = { > { OPTION_CALLBACK, 0, "recurse-submodules", &recurse_submodules, N_("on-demand"), > N_("control recursive fetching of submodules"), > PARSE_OPT_OPTARG, option_fetch_parse_recurse_submodules }, > + OPT_BOOL(0, "exit-code", &exit_code, > + N_("exit successfully if refs are updated")), I think it makes sense to flip this explanation: "exit non-zero if no refs are updated" or "treat it as an error if no refs are updated". Since the default behavior is already to exit successfully in this case. :) We may also want to advertise a particular result code so callers can distinguish it from regular die() errors. -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 2/1] server-info: conditionally update on fetch 2019-05-14 12:19 ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason 2019-05-14 12:29 ` Jeff King @ 2019-05-15 0:45 ` Eric Wong 2019-05-15 20:38 ` [WIP] repack leaving stale entries in objects/info/packs Eric Wong 1 sibling, 1 reply; 26+ messages in thread From: Eric Wong @ 2019-05-15 0:45 UTC (permalink / raw) To: Ævar Arnfjörð Bjarmason; +Cc: Jeff King, git, Junio C Hamano Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote: > Aside from this change, I wonder if making "fetch" optionally "exit 1" > if no refs were updated would be useful, as in the below WIP. Of course > it would be better to distinguish errors from "no refs to update". Yes, we should've had this feature all along :) And it's easy for me to build off your WIP to have fetch update server info iff info/refs already exists: -------8<------- Subject: [PATCH 2/1] server-info: conditionally update on fetch Since fetch can invalidate existing server info files, use the new `updated_refs' counter to update server info files iff info/refs already exists. Note: this depends on Ævar's WIP in: https://public-inbox.org/git/87ftphw7mv.fsf@evledraar.gmail.com/ Signed-off-by: Eric Wong <e@80x24.org> --- builtin/fetch.c | 3 +++ server-info.c | 22 +++++++++++++++++++--- t/t5513-fetch-track.sh | 21 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/builtin/fetch.c b/builtin/fetch.c index da5414d9db..b35d4d105d 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -1681,6 +1681,9 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) close_all_packs(the_repository->objects); + if (updated_refs) + update_server_info(-1); + argv_array_pushl(&argv_gc_auto, "gc", "--auto", NULL); if (verbosity < 0) argv_array_push(&argv_gc_auto, "--quiet"); diff --git a/server-info.c b/server-info.c index e68f785c2f..d4065d56a3 100644 --- a/server-info.c +++ b/server-info.c @@ -7,6 +7,7 @@ #include "packfile.h" #include "object-store.h" #include "strbuf.h" +#include "dir.h" struct update_info_ctx { FILE *cur_fp; @@ -170,10 +171,25 @@ static int generate_info_refs(struct update_info_ctx *uic) return for_each_ref(add_info_ref, uic); } -static int update_info_refs(int force) +static int want_update(int *force, const char *path) +{ + if (*force < 0) { + if (file_exists(path)) + *force = 0; /* continue to normal update */ + else + return 0; + } + return 1; +} + +static int update_info_refs(int *force) { char *path = git_pathdup("info/refs"); - int ret = update_info_file(path, generate_info_refs, force); + int ret = 0; + + if (want_update(force, path)) + ret = update_info_file(path, generate_info_refs, *force); + free(path); return ret; } @@ -361,7 +377,7 @@ int update_server_info(int force) */ int errs = 0; - errs = errs | update_info_refs(force); + errs = errs | update_info_refs(&force); errs = errs | update_info_packs(force); /* remove leftover rev-cache file if there is any */ diff --git a/t/t5513-fetch-track.sh b/t/t5513-fetch-track.sh index 65d1e05bd6..421f16ddfd 100755 --- a/t/t5513-fetch-track.sh +++ b/t/t5513-fetch-track.sh @@ -27,4 +27,25 @@ test_expect_success fetch ' ) ' +test_expect_success 'info/refs not created by fetch' ' + ( + cd other && + test_path_is_dir .git/info && + ! test_path_is_file .git/info/refs + ) +' + +test_expect_success 'info/refs updated by fetch if it already exists' ' + git branch b/for-info-refs && + ( + cd other && + git update-server-info && + test_path_is_file .git/info/refs && + ! grep b/for-info-refs .git/info/refs && + git fetch && + test_path_is_file .git/info/refs && + grep b/for-info-refs .git/info/refs + ) +' + test_done -- EW ^ permalink raw reply related [flat|nested] 26+ messages in thread
* [WIP] repack leaving stale entries in objects/info/packs 2019-05-15 0:45 ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong @ 2019-05-15 20:38 ` Eric Wong 2019-05-15 21:48 ` Jeff King 0 siblings, 1 reply; 26+ messages in thread From: Eric Wong @ 2019-05-15 20:38 UTC (permalink / raw) To: git; +Cc: Jeff King, Junio C Hamano, Ævar Arnfjörð Bjarmason I've also noticed objects/info/packs contains stale entries after repack/gc runs on current git. Tried adding reprepare_packed_git before update_server_info, but that didn't seem to work; so maybe something isn't cleared. Might have time to investigate more this week, might not... diff --git a/builtin/repack.c b/builtin/repack.c index 67f8978043..ec46f2099a 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -559,8 +559,10 @@ int cmd_repack(int argc, const char **argv, const char *prefix) prune_shallow(PRUNE_QUICK); } - if (!no_update_server_info) + if (!no_update_server_info) { + reprepare_packed_git(the_repository); update_server_info(0); + } remove_temporary_files(); if (git_env_bool(GIT_TEST_MULTI_PACK_INDEX, 0)) diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh index 7411bf7fec..867d4fb0c7 100755 --- a/t/t6500-gc.sh +++ b/t/t6500-gc.sh @@ -71,6 +71,7 @@ test_expect_success 'gc --keep-largest-pack' ' git gc --keep-largest-pack && ( cd .git/objects/pack && ls *.pack ) >pack-list && test_line_count = 2 pack-list && + test_line_count = 2 .git/objects/info/packs && test_path_is_file $BASE_PACK && git fsck ) ^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [WIP] repack leaving stale entries in objects/info/packs 2019-05-15 20:38 ` [WIP] repack leaving stale entries in objects/info/packs Eric Wong @ 2019-05-15 21:48 ` Jeff King 2019-05-23 8:59 ` [PATCH] server-info: do not list unlinked packs Eric Wong 0 siblings, 1 reply; 26+ messages in thread From: Jeff King @ 2019-05-15 21:48 UTC (permalink / raw) To: Eric Wong; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason On Wed, May 15, 2019 at 08:38:39PM +0000, Eric Wong wrote: > I've also noticed objects/info/packs contains stale entries > after repack/gc runs on current git. > > Tried adding reprepare_packed_git before update_server_info, > but that didn't seem to work; so maybe something isn't cleared. > Might have time to investigate more this week, might not... We never delete entries from the in-memory packed_git list; a reprepare only adds to the list. You'd need to teach update_server_info() to ignore packs which are no longer present (or switch to exec-ing a separate update-server-info binary). -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH] server-info: do not list unlinked packs 2019-05-15 21:48 ` Jeff King @ 2019-05-23 8:59 ` Eric Wong 2019-05-23 10:24 ` Jeff King 0 siblings, 1 reply; 26+ messages in thread From: Eric Wong @ 2019-05-23 8:59 UTC (permalink / raw) To: Jeff King; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason Jeff King <peff@peff.net> wrote: > On Wed, May 15, 2019 at 08:38:39PM +0000, Eric Wong wrote: > > > I've also noticed objects/info/packs contains stale entries > > after repack/gc runs on current git. > > > > Tried adding reprepare_packed_git before update_server_info, > > but that didn't seem to work; so maybe something isn't cleared. > > Might have time to investigate more this week, might not... > > We never delete entries from the in-memory packed_git list; a reprepare > only adds to the list. You'd need to teach update_server_info() to > ignore packs which are no longer present (or switch to exec-ing a > separate update-server-info binary). Ah, checking files_exists() and setting a bit seems sufficient. --------8<--------- Subject: [PATCH] server-info: do not list unlinked packs Having non-existent packs in objects/info/packs causes dumb HTTP clients to abort. There remains a small window where the old objects/info/packs file can refer to unlinked packs. That's unavoidable even on a local FS given the time-of-use-time-of-check window between listing and retrieving files. Signed-off-by: Eric Wong <e@80x24.org> --- I think the small window I refer to can be worked around by teaching the dumb HTTP client to reread objects/info/packs if it 404s while trying to GET a pack... object-store.h | 1 + server-info.c | 7 ++++++- t/t6500-gc.sh | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/object-store.h b/object-store.h index 272e01e452..2c9facc8f2 100644 --- a/object-store.h +++ b/object-store.h @@ -77,6 +77,7 @@ struct packed_git { freshened:1, do_not_close:1, pack_promisor:1, + pack_unlinked:1, multi_pack_index:1; unsigned char hash[GIT_MAX_RAWSZ]; struct revindex_entry *revindex; diff --git a/server-info.c b/server-info.c index 41274d098b..69e2c5279b 100644 --- a/server-info.c +++ b/server-info.c @@ -1,4 +1,5 @@ #include "cache.h" +#include "dir.h" #include "repository.h" #include "refs.h" #include "object.h" @@ -199,12 +200,16 @@ static void init_pack_info(const char *infofile, int force) */ if (!p->pack_local) continue; + if (!file_exists(p->pack_name)) { + p->pack_unlinked = 1; + continue; + } i++; } num_pack = i; info = xcalloc(num_pack, sizeof(struct pack_info *)); for (i = 0, p = get_all_packs(the_repository); p; p = p->next) { - if (!p->pack_local) + if (!p->pack_local || p->pack_unlinked) continue; assert(i < num_pack); info[i] = xcalloc(1, sizeof(struct pack_info)); diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh index 515c6735e9..c0f04dc6b0 100755 --- a/t/t6500-gc.sh +++ b/t/t6500-gc.sh @@ -71,6 +71,8 @@ test_expect_success 'gc --keep-largest-pack' ' git gc --keep-largest-pack && ( cd .git/objects/pack && ls *.pack ) >pack-list && test_line_count = 2 pack-list && + awk "/^P /{print \$2}" <.git/objects/info/packs >pack-info && + test_line_count = 2 pack-info && test_path_is_file $BASE_PACK && git fsck ) base-commit: aa25c82427ae70aebf3b8f970f2afd54e9a2a8c6 -- EW ^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH] server-info: do not list unlinked packs 2019-05-23 8:59 ` [PATCH] server-info: do not list unlinked packs Eric Wong @ 2019-05-23 10:24 ` Jeff King 2019-05-23 17:27 ` [PATCH v2] " Eric Wong 0 siblings, 1 reply; 26+ messages in thread From: Jeff King @ 2019-05-23 10:24 UTC (permalink / raw) To: Eric Wong; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason On Thu, May 23, 2019 at 08:59:59AM +0000, Eric Wong wrote: > > We never delete entries from the in-memory packed_git list; a reprepare > > only adds to the list. You'd need to teach update_server_info() to > > ignore packs which are no longer present (or switch to exec-ing a > > separate update-server-info binary). > > Ah, checking files_exists() and setting a bit seems sufficient. Yes, though we do we even need to store the bit? I.e., > @@ -199,12 +200,16 @@ static void init_pack_info(const char *infofile, int force) > */ > if (!p->pack_local) > continue; > + if (!file_exists(p->pack_name)) { > + p->pack_unlinked = 1; > + continue; > + } > i++; > } > num_pack = i; > info = xcalloc(num_pack, sizeof(struct pack_info *)); > for (i = 0, p = get_all_packs(the_repository); p; p = p->next) { > - if (!p->pack_local) > + if (!p->pack_local || p->pack_unlinked) > continue; > assert(i < num_pack); > info[i] = xcalloc(1, sizeof(struct pack_info)); If we just check file_exists() in the second loop, then this is entirely local to update_server_info(). And other users of packed_git do not have to wonder who is responsible for setting that flag in the global list. It does mean you'd over-allocate the array (and num_pack would have to be adjusted down to "i" after the second loop), but that's not a big deal. I do think the whole two-loop thing would be more readable if we simply grew it on the fly with ALLOC_GROW(). -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v2] server-info: do not list unlinked packs 2019-05-23 10:24 ` Jeff King @ 2019-05-23 17:27 ` Eric Wong 2019-05-24 6:05 ` Jeff King 2019-05-24 7:34 ` Ævar Arnfjörð Bjarmason 0 siblings, 2 replies; 26+ messages in thread From: Eric Wong @ 2019-05-23 17:27 UTC (permalink / raw) To: Jeff King; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason Jeff King <peff@peff.net> wrote: > On Thu, May 23, 2019 at 08:59:59AM +0000, Eric Wong wrote: > > > > We never delete entries from the in-memory packed_git list; a reprepare > > > only adds to the list. You'd need to teach update_server_info() to > > > ignore packs which are no longer present (or switch to exec-ing a > > > separate update-server-info binary). > > > > Ah, checking files_exists() and setting a bit seems sufficient. > > Yes, though we do we even need to store the bit? I wanted to avoid the over-allocation, and I hit a bounds error because I forgot to adjust num_pack as you mentioned below. > I.e., > > > @@ -199,12 +200,16 @@ static void init_pack_info(const char *infofile, int force) > > */ > > if (!p->pack_local) > > continue; > > + if (!file_exists(p->pack_name)) { > > + p->pack_unlinked = 1; > > + continue; > > + } > > i++; > > } > > num_pack = i; > > info = xcalloc(num_pack, sizeof(struct pack_info *)); > > for (i = 0, p = get_all_packs(the_repository); p; p = p->next) { > > - if (!p->pack_local) > > + if (!p->pack_local || p->pack_unlinked) > > continue; > > assert(i < num_pack); > > info[i] = xcalloc(1, sizeof(struct pack_info)); > > If we just check file_exists() in the second loop, then this is entirely > local to update_server_info(). And other users of packed_git do not have > to wonder who is responsible for setting that flag in the global list. > > It does mean you'd over-allocate the array (and num_pack would have to > be adjusted down to "i" after the second loop), but that's not a big > deal. I do think the whole two-loop thing would be more readable if we > simply grew it on the fly with ALLOC_GROW(). ALLOC_GROW makes the whole thing much nicer. Thanks for the hint :> ---------------------8<--------------------- Subject: [PATCH] server-info: do not list unlinked packs Having non-existent packs in objects/info/packs causes dumb HTTP clients to abort. v2: use single loop with ALLOC_GROW as suggested by Jeff King Signed-off-by: Eric Wong <e@80x24.org> Helped-by: Jeff King <peff@peff.net> --- Interdiff: diff --git a/object-store.h b/object-store.h index 2c9facc8f2..272e01e452 100644 --- a/object-store.h +++ b/object-store.h @@ -77,7 +77,6 @@ struct packed_git { freshened:1, do_not_close:1, pack_promisor:1, - pack_unlinked:1, multi_pack_index:1; unsigned char hash[GIT_MAX_RAWSZ]; struct revindex_entry *revindex; diff --git a/server-info.c b/server-info.c index 69e2c5279b..92187c70db 100644 --- a/server-info.c +++ b/server-info.c @@ -192,30 +192,21 @@ static void init_pack_info(const char *infofile, int force) { struct packed_git *p; int stale; - int i = 0; + int i; + size_t alloc = 0; for (p = get_all_packs(the_repository); p; p = p->next) { /* we ignore things on alternate path since they are * not available to the pullers in general. */ - if (!p->pack_local) - continue; - if (!file_exists(p->pack_name)) { - p->pack_unlinked = 1; - continue; - } - i++; - } - num_pack = i; - info = xcalloc(num_pack, sizeof(struct pack_info *)); - for (i = 0, p = get_all_packs(the_repository); p; p = p->next) { - if (!p->pack_local || p->pack_unlinked) + if (!p->pack_local || !file_exists(p->pack_name)) continue; - assert(i < num_pack); + + i = num_pack++; + ALLOC_GROW(info, num_pack, alloc); info[i] = xcalloc(1, sizeof(struct pack_info)); info[i]->p = p; info[i]->old_num = -1; - i++; } if (infofile && !force) server-info.c | 18 +++++++----------- t/t6500-gc.sh | 2 ++ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/server-info.c b/server-info.c index 41274d098b..92187c70db 100644 --- a/server-info.c +++ b/server-info.c @@ -1,4 +1,5 @@ #include "cache.h" +#include "dir.h" #include "repository.h" #include "refs.h" #include "object.h" @@ -191,26 +192,21 @@ static void init_pack_info(const char *infofile, int force) { struct packed_git *p; int stale; - int i = 0; + int i; + size_t alloc = 0; for (p = get_all_packs(the_repository); p; p = p->next) { /* we ignore things on alternate path since they are * not available to the pullers in general. */ - if (!p->pack_local) - continue; - i++; - } - num_pack = i; - info = xcalloc(num_pack, sizeof(struct pack_info *)); - for (i = 0, p = get_all_packs(the_repository); p; p = p->next) { - if (!p->pack_local) + if (!p->pack_local || !file_exists(p->pack_name)) continue; - assert(i < num_pack); + + i = num_pack++; + ALLOC_GROW(info, num_pack, alloc); info[i] = xcalloc(1, sizeof(struct pack_info)); info[i]->p = p; info[i]->old_num = -1; - i++; } if (infofile && !force) diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh index 515c6735e9..c0f04dc6b0 100755 --- a/t/t6500-gc.sh +++ b/t/t6500-gc.sh @@ -71,6 +71,8 @@ test_expect_success 'gc --keep-largest-pack' ' git gc --keep-largest-pack && ( cd .git/objects/pack && ls *.pack ) >pack-list && test_line_count = 2 pack-list && + awk "/^P /{print \$2}" <.git/objects/info/packs >pack-info && + test_line_count = 2 pack-info && test_path_is_file $BASE_PACK && git fsck ) -- EW ^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH v2] server-info: do not list unlinked packs 2019-05-23 17:27 ` [PATCH v2] " Eric Wong @ 2019-05-24 6:05 ` Jeff King 2019-05-24 7:34 ` Ævar Arnfjörð Bjarmason 1 sibling, 0 replies; 26+ messages in thread From: Jeff King @ 2019-05-24 6:05 UTC (permalink / raw) To: Eric Wong; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason On Thu, May 23, 2019 at 05:27:23PM +0000, Eric Wong wrote: > ALLOC_GROW makes the whole thing much nicer. > Thanks for the hint :> > > ---------------------8<--------------------- > Subject: [PATCH] server-info: do not list unlinked packs This looks fine to me (though the "v2" note is in your commit message). I'd probably break it into two separate patches, but I can live with it either way. -Peff ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v2] server-info: do not list unlinked packs 2019-05-23 17:27 ` [PATCH v2] " Eric Wong 2019-05-24 6:05 ` Jeff King @ 2019-05-24 7:34 ` Ævar Arnfjörð Bjarmason 1 sibling, 0 replies; 26+ messages in thread From: Ævar Arnfjörð Bjarmason @ 2019-05-24 7:34 UTC (permalink / raw) To: Eric Wong; +Cc: Jeff King, git, Junio C Hamano On Thu, May 23 2019, Eric Wong wrote: > Jeff King <peff@peff.net> wrote: >> On Thu, May 23, 2019 at 08:59:59AM +0000, Eric Wong wrote: >> >> > > We never delete entries from the in-memory packed_git list; a reprepare >> > > only adds to the list. You'd need to teach update_server_info() to >> > > ignore packs which are no longer present (or switch to exec-ing a >> > > separate update-server-info binary). >> > >> > Ah, checking files_exists() and setting a bit seems sufficient. >> >> Yes, though we do we even need to store the bit? > > I wanted to avoid the over-allocation, and I hit a bounds error > because I forgot to adjust num_pack as you mentioned > below. [...]ALLOC_GROW makes the whole thing much nicer. If you want to avoid over-allocation the last thing you want is ALLOC_GROW() :) I.e. see alloc_nr() in cache.h, we explicitly over-allocate with it. But as our extensive use of it shows this sort of pattern is the right trade-off, both in terms of performance on modern hardware, and code readability in cases like this where we're never going to realistically have to worry about memory pressure. ^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3] update-server-info: avoid needless overwrites 2019-05-11 1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong ` (2 preceding siblings ...) 2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason @ 2019-05-13 23:17 ` Eric Wong 3 siblings, 0 replies; 26+ messages in thread From: Eric Wong @ 2019-05-13 23:17 UTC (permalink / raw) To: git; +Cc: Eric Sunshine, Ævar Arnfjörð Bjarmason, Jeff King Do not change the existing info/refs and objects/info/packs files if they match the existing content on the filesystem. This is intended to preserve mtime and make it easier for dumb HTTP pollers to rely on the If-Modified-Since header. Combined with stdio and kernel buffering; the kernel should be able to avoid block layer writes and reduce wear for small files. As a result, the --force option is no longer needed. So stop documenting it, but let it remain for compatibility (and debugging, if necessary). v3: perform incremental comparison while generating to avoid OOM with giant files. Remove documentation for --force. Signed-off-by: Eric Wong <e@80x24.org> --- OK, performing the incremental comparison wasn't nearly as bad as thought it'd be. I might be suffering from compiler-linker-slowness PTSD :x Interdiff: diff --git a/Documentation/git-update-server-info.txt b/Documentation/git-update-server-info.txt index bd0e36492f..969bb2e15f 100644 --- a/Documentation/git-update-server-info.txt +++ b/Documentation/git-update-server-info.txt @@ -9,7 +9,7 @@ git-update-server-info - Update auxiliary info file to help dumb servers SYNOPSIS -------- [verse] -'git update-server-info' [--force] +'git update-server-info' DESCRIPTION ----------- @@ -19,15 +19,6 @@ $GIT_OBJECT_DIRECTORY/info directories to help clients discover what references and packs the server has. This command generates such auxiliary files. - -OPTIONS -------- - --f:: ---force:: - Update the info files from scratch. - - OUTPUT ------ diff --git a/server-info.c b/server-info.c index 11515804d4..e68f785c2f 100644 --- a/server-info.c +++ b/server-info.c @@ -8,41 +8,54 @@ #include "object-store.h" #include "strbuf.h" -static int files_differ(FILE *fp, const char *path) +struct update_info_ctx { + FILE *cur_fp; + FILE *old_fp; /* becomes NULL if it differs from cur_fp */ + struct strbuf cur_sb; + struct strbuf old_sb; +}; + +static void uic_mark_stale(struct update_info_ctx *uic) { - struct stat st; - git_hash_ctx c; - struct object_id oid_old, oid_new; - struct strbuf tmp = STRBUF_INIT; - long new_len = ftell(fp); + fclose(uic->old_fp); + uic->old_fp = NULL; +} - if (new_len < 0 || stat(path, &st) < 0) - return 1; - if (!S_ISREG(st.st_mode)) - return 1; - if ((off_t)new_len != st.st_size) - return 1; +static int uic_is_stale(const struct update_info_ctx *uic) +{ + return uic->old_fp == NULL; +} - rewind(fp); - if (strbuf_fread(&tmp, (size_t)new_len, fp) != (size_t)new_len) { - strbuf_release(&tmp); - return 1; - } - the_hash_algo->init_fn(&c); - the_hash_algo->update_fn(&c, tmp.buf, tmp.len); - the_hash_algo->final_fn(oid_new.hash, &c); - strbuf_release(&tmp); +static int uic_printf(struct update_info_ctx *uic, const char *fmt, ...) +{ + va_list ap; + int ret = -1; - if (strbuf_read_file(&tmp, path, (size_t)st.st_size) < 0) { - strbuf_release(&tmp); - return 1; + va_start(ap, fmt); + + if (uic_is_stale(uic)) { + ret = vfprintf(uic->cur_fp, fmt, ap); + } else { + ssize_t r; + struct strbuf *cur = &uic->cur_sb; + struct strbuf *old = &uic->old_sb; + + strbuf_reset(cur); + strbuf_vinsertf(cur, 0, fmt, ap); + + strbuf_reset(old); + strbuf_grow(old, cur->len); + r = fread(old->buf, 1, cur->len, uic->old_fp); + if (r != cur->len || memcmp(old->buf, cur->buf, r)) + uic_mark_stale(uic); + + if (fwrite(cur->buf, 1, cur->len, uic->cur_fp) == cur->len) + ret = 0; } - the_hash_algo->init_fn(&c); - the_hash_algo->update_fn(&c, tmp.buf, tmp.len); - the_hash_algo->final_fn(oid_old.hash, &c); - strbuf_release(&tmp); - return hashcmp(oid_old.hash, oid_new.hash); + va_end(ap); + + return ret; } /* @@ -50,31 +63,61 @@ static int files_differ(FILE *fp, const char *path) * it into place. The contents of the file come from "generate", which * should return non-zero if it encounters an error. */ -static int update_info_file(char *path, int (*generate)(FILE *), int force) +static int update_info_file(char *path, + int (*generate)(struct update_info_ctx *), + int force) { char *tmp = mkpathdup("%s_XXXXXX", path); int ret = -1; int fd = -1; - FILE *fp = NULL, *to_close; - int do_update; + FILE *to_close; + struct update_info_ctx uic = { + .cur_fp = NULL, + .old_fp = NULL, + .cur_sb = STRBUF_INIT, + .old_sb = STRBUF_INIT + }; safe_create_leading_directories(path); fd = git_mkstemp_mode(tmp, 0666); if (fd < 0) goto out; - to_close = fp = fdopen(fd, "w+"); - if (!fp) + to_close = uic.cur_fp = fdopen(fd, "w"); + if (!uic.cur_fp) goto out; fd = -1; - ret = generate(fp); + + /* no problem on ENOENT and old_fp == NULL, it's stale, now */ + if (!force) + uic.old_fp = fopen_or_warn(path, "r"); + + /* + * uic_printf will compare incremental comparison aginst old_fp + * and mark uic as stale if needed + */ + ret = generate(&uic); if (ret) goto out; - do_update = force || files_differ(fp, path); - fp = NULL; + /* new file may be shorter than the old one, check here */ + if (!uic_is_stale(&uic)) { + struct stat st; + long new_len = ftell(uic.cur_fp); + int old_fd = fileno(uic.old_fp); + + if (new_len < 0) { + ret = -1; + goto out; + } + if (fstat(old_fd, &st) || (st.st_size != (size_t)new_len)) + uic_mark_stale(&uic); + } + + uic.cur_fp = NULL; if (fclose(to_close)) goto out; - if (do_update) { + + if (uic_is_stale(&uic)) { if (adjust_shared_perm(tmp) < 0) goto out; if (rename(tmp, path) < 0) @@ -87,40 +130,44 @@ static int update_info_file(char *path, int (*generate)(FILE *), int force) out: if (ret) { error_errno("unable to update %s", path); - if (fp) - fclose(fp); + if (uic.cur_fp) + fclose(uic.cur_fp); else if (fd >= 0) close(fd); unlink(tmp); } free(tmp); + if (uic.old_fp) + fclose(uic.old_fp); + strbuf_release(&uic.old_sb); + strbuf_release(&uic.cur_sb); return ret; } static int add_info_ref(const char *path, const struct object_id *oid, int flag, void *cb_data) { - FILE *fp = cb_data; + struct update_info_ctx *uic = cb_data; struct object *o = parse_object(the_repository, oid); if (!o) return -1; - if (fprintf(fp, "%s %s\n", oid_to_hex(oid), path) < 0) + if (uic_printf(uic, "%s %s\n", oid_to_hex(oid), path) < 0) return -1; if (o->type == OBJ_TAG) { o = deref_tag(the_repository, o, path, 0); if (o) - if (fprintf(fp, "%s %s^{}\n", + if (uic_printf(uic, "%s %s^{}\n", oid_to_hex(&o->oid), path) < 0) return -1; } return 0; } -static int generate_info_refs(FILE *fp) +static int generate_info_refs(struct update_info_ctx *uic) { - return for_each_ref(add_info_ref, fp); + return for_each_ref(add_info_ref, uic); } static int update_info_refs(int force) @@ -281,14 +328,14 @@ static void free_pack_info(void) free(info); } -static int write_pack_info_file(FILE *fp) +static int write_pack_info_file(struct update_info_ctx *uic) { int i; for (i = 0; i < num_pack; i++) { - if (fprintf(fp, "P %s\n", pack_basename(info[i]->p)) < 0) + if (uic_printf(uic, "P %s\n", pack_basename(info[i]->p)) < 0) return -1; } - if (fputc('\n', fp) == EOF) + if (uic_printf(uic, "\n") < 0) return -1; return 0; } Documentation/git-update-server-info.txt | 11 +- server-info.c | 140 +++++++++++++++++++---- t/t5200-update-server-info.sh | 41 +++++++ 3 files changed, 158 insertions(+), 34 deletions(-) create mode 100755 t/t5200-update-server-info.sh diff --git a/Documentation/git-update-server-info.txt b/Documentation/git-update-server-info.txt index bd0e36492f..969bb2e15f 100644 --- a/Documentation/git-update-server-info.txt +++ b/Documentation/git-update-server-info.txt @@ -9,7 +9,7 @@ git-update-server-info - Update auxiliary info file to help dumb servers SYNOPSIS -------- [verse] -'git update-server-info' [--force] +'git update-server-info' DESCRIPTION ----------- @@ -19,15 +19,6 @@ $GIT_OBJECT_DIRECTORY/info directories to help clients discover what references and packs the server has. This command generates such auxiliary files. - -OPTIONS -------- - --f:: ---force:: - Update the info files from scratch. - - OUTPUT ------ diff --git a/server-info.c b/server-info.c index 41274d098b..e68f785c2f 100644 --- a/server-info.c +++ b/server-info.c @@ -6,82 +6,174 @@ #include "tag.h" #include "packfile.h" #include "object-store.h" +#include "strbuf.h" + +struct update_info_ctx { + FILE *cur_fp; + FILE *old_fp; /* becomes NULL if it differs from cur_fp */ + struct strbuf cur_sb; + struct strbuf old_sb; +}; + +static void uic_mark_stale(struct update_info_ctx *uic) +{ + fclose(uic->old_fp); + uic->old_fp = NULL; +} + +static int uic_is_stale(const struct update_info_ctx *uic) +{ + return uic->old_fp == NULL; +} + +static int uic_printf(struct update_info_ctx *uic, const char *fmt, ...) +{ + va_list ap; + int ret = -1; + + va_start(ap, fmt); + + if (uic_is_stale(uic)) { + ret = vfprintf(uic->cur_fp, fmt, ap); + } else { + ssize_t r; + struct strbuf *cur = &uic->cur_sb; + struct strbuf *old = &uic->old_sb; + + strbuf_reset(cur); + strbuf_vinsertf(cur, 0, fmt, ap); + + strbuf_reset(old); + strbuf_grow(old, cur->len); + r = fread(old->buf, 1, cur->len, uic->old_fp); + if (r != cur->len || memcmp(old->buf, cur->buf, r)) + uic_mark_stale(uic); + + if (fwrite(cur->buf, 1, cur->len, uic->cur_fp) == cur->len) + ret = 0; + } + + va_end(ap); + + return ret; +} /* * Create the file "path" by writing to a temporary file and renaming * it into place. The contents of the file come from "generate", which * should return non-zero if it encounters an error. */ -static int update_info_file(char *path, int (*generate)(FILE *)) +static int update_info_file(char *path, + int (*generate)(struct update_info_ctx *), + int force) { char *tmp = mkpathdup("%s_XXXXXX", path); int ret = -1; int fd = -1; - FILE *fp = NULL, *to_close; + FILE *to_close; + struct update_info_ctx uic = { + .cur_fp = NULL, + .old_fp = NULL, + .cur_sb = STRBUF_INIT, + .old_sb = STRBUF_INIT + }; safe_create_leading_directories(path); fd = git_mkstemp_mode(tmp, 0666); if (fd < 0) goto out; - to_close = fp = fdopen(fd, "w"); - if (!fp) + to_close = uic.cur_fp = fdopen(fd, "w"); + if (!uic.cur_fp) goto out; fd = -1; - ret = generate(fp); + + /* no problem on ENOENT and old_fp == NULL, it's stale, now */ + if (!force) + uic.old_fp = fopen_or_warn(path, "r"); + + /* + * uic_printf will compare incremental comparison aginst old_fp + * and mark uic as stale if needed + */ + ret = generate(&uic); if (ret) goto out; - fp = NULL; + + /* new file may be shorter than the old one, check here */ + if (!uic_is_stale(&uic)) { + struct stat st; + long new_len = ftell(uic.cur_fp); + int old_fd = fileno(uic.old_fp); + + if (new_len < 0) { + ret = -1; + goto out; + } + if (fstat(old_fd, &st) || (st.st_size != (size_t)new_len)) + uic_mark_stale(&uic); + } + + uic.cur_fp = NULL; if (fclose(to_close)) goto out; - if (adjust_shared_perm(tmp) < 0) - goto out; - if (rename(tmp, path) < 0) - goto out; + + if (uic_is_stale(&uic)) { + if (adjust_shared_perm(tmp) < 0) + goto out; + if (rename(tmp, path) < 0) + goto out; + } else { + unlink(tmp); + } ret = 0; out: if (ret) { error_errno("unable to update %s", path); - if (fp) - fclose(fp); + if (uic.cur_fp) + fclose(uic.cur_fp); else if (fd >= 0) close(fd); unlink(tmp); } free(tmp); + if (uic.old_fp) + fclose(uic.old_fp); + strbuf_release(&uic.old_sb); + strbuf_release(&uic.cur_sb); return ret; } static int add_info_ref(const char *path, const struct object_id *oid, int flag, void *cb_data) { - FILE *fp = cb_data; + struct update_info_ctx *uic = cb_data; struct object *o = parse_object(the_repository, oid); if (!o) return -1; - if (fprintf(fp, "%s %s\n", oid_to_hex(oid), path) < 0) + if (uic_printf(uic, "%s %s\n", oid_to_hex(oid), path) < 0) return -1; if (o->type == OBJ_TAG) { o = deref_tag(the_repository, o, path, 0); if (o) - if (fprintf(fp, "%s %s^{}\n", + if (uic_printf(uic, "%s %s^{}\n", oid_to_hex(&o->oid), path) < 0) return -1; } return 0; } -static int generate_info_refs(FILE *fp) +static int generate_info_refs(struct update_info_ctx *uic) { - return for_each_ref(add_info_ref, fp); + return for_each_ref(add_info_ref, uic); } -static int update_info_refs(void) +static int update_info_refs(int force) { char *path = git_pathdup("info/refs"); - int ret = update_info_file(path, generate_info_refs); + int ret = update_info_file(path, generate_info_refs, force); free(path); return ret; } @@ -236,14 +328,14 @@ static void free_pack_info(void) free(info); } -static int write_pack_info_file(FILE *fp) +static int write_pack_info_file(struct update_info_ctx *uic) { int i; for (i = 0; i < num_pack; i++) { - if (fprintf(fp, "P %s\n", pack_basename(info[i]->p)) < 0) + if (uic_printf(uic, "P %s\n", pack_basename(info[i]->p)) < 0) return -1; } - if (fputc('\n', fp) == EOF) + if (uic_printf(uic, "\n") < 0) return -1; return 0; } @@ -254,7 +346,7 @@ static int update_info_packs(int force) int ret; init_pack_info(infofile, force); - ret = update_info_file(infofile, write_pack_info_file); + ret = update_info_file(infofile, write_pack_info_file, force); free_pack_info(); free(infofile); return ret; @@ -269,7 +361,7 @@ int update_server_info(int force) */ int errs = 0; - errs = errs | update_info_refs(); + errs = errs | update_info_refs(force); errs = errs | update_info_packs(force); /* remove leftover rev-cache file if there is any */ diff --git a/t/t5200-update-server-info.sh b/t/t5200-update-server-info.sh new file mode 100755 index 0000000000..21a58eecb9 --- /dev/null +++ b/t/t5200-update-server-info.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +test_description='Test git update-server-info' + +. ./test-lib.sh + +test_expect_success 'setup' 'test_commit file' + +test_expect_success 'create info/refs' ' + git update-server-info && + test_path_is_file .git/info/refs +' + +test_expect_success 'modify and store mtime' ' + test-tool chmtime =0 .git/info/refs && + test-tool chmtime --get .git/info/refs >a +' + +test_expect_success 'info/refs is not needlessly overwritten' ' + git update-server-info && + test-tool chmtime --get .git/info/refs >b && + test_cmp a b +' + +test_expect_success 'info/refs can be forced to update' ' + git update-server-info -f && + test-tool chmtime --get .git/info/refs >b && + ! test_cmp a b +' + +test_expect_success 'info/refs updates when changes are made' ' + test-tool chmtime =0 .git/info/refs && + test-tool chmtime --get .git/info/refs >b && + test_cmp a b && + git update-ref refs/heads/foo HEAD && + git update-server-info && + test-tool chmtime --get .git/info/refs >b && + ! test_cmp a b +' + +test_done -- EW ^ permalink raw reply related [flat|nested] 26+ messages in thread
end of thread, other threads:[~2019-05-24 8:18 UTC | newest] Thread overview: 26+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2019-05-11 1:34 [PATCH] update-server-info: avoid needless overwrites Eric Wong 2019-05-11 7:35 ` Eric Sunshine 2019-05-11 20:47 ` [PATCH v2] " Eric Wong 2019-05-11 21:17 ` [PATCH] " Eric Wong 2019-05-11 23:37 ` Ævar Arnfjörð Bjarmason 2019-05-12 0:38 ` Eric Wong 2019-05-12 4:08 ` Jeff King 2019-05-12 7:16 ` Ævar Arnfjörð Bjarmason 2019-05-14 9:47 ` Jeff King 2019-05-14 10:33 ` Ævar Arnfjörð Bjarmason 2019-05-14 11:24 ` Jeff King 2019-05-14 11:57 ` Ævar Arnfjörð Bjarmason 2019-05-14 11:50 ` Eric Wong 2019-05-14 12:13 ` dumb HTTP things I want to do Eric Wong 2019-05-14 12:27 ` Jeff King 2019-05-14 12:19 ` [PATCH] update-server-info: avoid needless overwrites Ævar Arnfjörð Bjarmason 2019-05-14 12:29 ` Jeff King 2019-05-15 0:45 ` [PATCH 2/1] server-info: conditionally update on fetch Eric Wong 2019-05-15 20:38 ` [WIP] repack leaving stale entries in objects/info/packs Eric Wong 2019-05-15 21:48 ` Jeff King 2019-05-23 8:59 ` [PATCH] server-info: do not list unlinked packs Eric Wong 2019-05-23 10:24 ` Jeff King 2019-05-23 17:27 ` [PATCH v2] " Eric Wong 2019-05-24 6:05 ` Jeff King 2019-05-24 7:34 ` Ævar Arnfjörð Bjarmason 2019-05-13 23:17 ` [PATCH v3] update-server-info: avoid needless overwrites Eric Wong
Code repositories for project(s) associated with this public inbox https://80x24.org/mirrors/git.git This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).