git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Jeff King <peff@peff.net>
To: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Cc: git@vger.kernel.org, hi-angel@yandex.ru,
	ramsay@ramsayjones.plus.com, sunshine@sunshineco.com,
	Junio C Hamano <gitster@pobox.com>,
	Johannes Schindelin <Johannes.Schindelin@gmx.de>
Subject: Re: [PATCH v4 1/2] refs.c: refactor check_refname_component()
Date: Wed, 6 Mar 2019 16:49:13 -0500	[thread overview]
Message-ID: <20190306214912.GA32630@sigill.intra.peff.net> (raw)
In-Reply-To: <20190305120834.7284-2-pclouds@gmail.com>

On Tue, Mar 05, 2019 at 07:08:33PM +0700, Nguyễn Thái Ngọc Duy wrote:

> @@ -71,11 +82,15 @@ static unsigned char refname_disposition[256] = {
>   * - it ends with a "/", or
>   * - it ends with ".lock", or
>   * - it contains a "@{" portion
> + *
> + * in which case cp_out points to the beginning of the illegal part.
>   */
> -static int check_refname_component(const char *refname, int *flags)
> +static enum refname_check_code do_check_refname_component(
> +	const char *refname, int *flags, const char **cp_out)

Hmm, OK, so we learn not only what type of problem we hit, but also the
exact character where we found it. And then we just keep mutating that
character until we have something that passes.

I can't think of any reason that wouldn't work. As you note, it's
possibly quadratic, though that might be OK for our purposes.

I had envisioned just sanitizing each character into an output buffer as
we did the checks. It does introduce some complexities, though, because
now the checking function is doing the replacement (so it has to know
the right sanitizing rule for each case).

The patch below is a rough cut at that, just for discussion.  You can
ignore the check-ref-format bits; they were just to make poking at it
easier, though perhaps we'd want something like that in the long run.

I suspect check_refname_component() could be made a bit more readable by
reordering a few bits. E.g., why do we check for a leading "." at the
_end_, after having parsed the entire rest of the component for errors?

I dunno. I think I can live with what you've got in your series, but I
figured I'd share this for the sake of completeness. If you really love
it, feel free to adapt it.

diff --git a/builtin/check-ref-format.c b/builtin/check-ref-format.c
index bc67d3f0a8..41b5434be2 100644
--- a/builtin/check-ref-format.c
+++ b/builtin/check-ref-format.c
@@ -56,6 +56,7 @@ int cmd_check_ref_format(int argc, const char **argv, const char *prefix)
 	int i;
 	int normalize = 0;
 	int flags = 0;
+	int sanitize = 0;
 	const char *refname;
 
 	if (argc == 2 && !strcmp(argv[1], "-h"))
@@ -73,13 +74,22 @@ int cmd_check_ref_format(int argc, const char **argv, const char *prefix)
 			flags &= ~REFNAME_ALLOW_ONELEVEL;
 		else if (!strcmp(argv[i], "--refspec-pattern"))
 			flags |= REFNAME_REFSPEC_PATTERN;
+		else if (!strcmp(argv[i], "--sanitize"))
+			sanitize = 1;
 		else
 			usage(builtin_check_ref_format_usage);
 	}
 	if (! (i == argc - 1))
 		usage(builtin_check_ref_format_usage);
 
 	refname = argv[i];
+	if (sanitize) {
+		struct strbuf out = STRBUF_INIT;
+		sanitize_refname(refname, &out);
+		printf("%s\n", out.buf);
+		strbuf_release(&out);
+		return 0;
+	}
 	if (normalize)
 		refname = collapse_slashes(refname);
 	if (check_refname_format(refname, flags))
diff --git a/refs.c b/refs.c
index 142888a40a..2a0c0c6338 100644
--- a/refs.c
+++ b/refs.c
@@ -72,30 +72,58 @@ static unsigned char refname_disposition[256] = {
  * - it ends with ".lock", or
  * - it contains a "@{" portion
  */
-static int check_refname_component(const char *refname, int *flags)
+static int check_refname_component(const char *refname, int *flags,
+				   struct strbuf *sanitized)
 {
 	const char *cp;
 	char last = '\0';
+	size_t component_start;
+
+	if (sanitized)
+		component_start = sanitized->len;
 
 	for (cp = refname; ; cp++) {
 		int ch = *cp & 255;
 		unsigned char disp = refname_disposition[ch];
+
+		if (sanitized && disp != 1)
+			strbuf_addch(sanitized, ch);
+
 		switch (disp) {
 		case 1:
 			goto out;
 		case 2:
-			if (last == '.')
-				return -1; /* Refname contains "..". */
+			if (last == '.') {
+				/* Refname contains "..". */
+				if (sanitized)
+					sanitized->len--; /* collapse ".." to single "." */
+				else
+					return -1;
+			}
 			break;
 		case 3:
-			if (last == '@')
-				return -1; /* Refname contains "@{". */
+			if (last == '@') {
+				/* Refname contains "@{". */
+				if (sanitized)
+					sanitized->buf[sanitized->len-1] = '-';
+				else
+					return -1;
+			}
 			break;
 		case 4:
-			return -1;
+			/* forbidden char */
+			if (sanitized)
+				sanitized->buf[sanitized->len-1] = '-';
+			else
+				return -1;
+			break;
 		case 5:
-			if (!(*flags & REFNAME_REFSPEC_PATTERN))
-				return -1; /* refspec can't be a pattern */
+			if (!(*flags & REFNAME_REFSPEC_PATTERN)) {
+				if (sanitized)
+					sanitized->buf[sanitized->len-1] = '-';
+				else
+					return -1; /* refspec can't be a pattern */
+			}
 
 			/*
 			 * Unset the pattern flag so that we only accept
@@ -109,26 +137,48 @@ static int check_refname_component(const char *refname, int *flags)
 out:
 	if (cp == refname)
 		return 0; /* Component has zero length. */
-	if (refname[0] == '.')
-		return -1; /* Component starts with '.'. */
+
+	if (refname[0] == '.') {
+		/* Component starts with '.'. */
+		if (sanitized)
+			sanitized->buf[component_start] = '-';
+		else
+			return -1;
+	}
 	if (cp - refname >= LOCK_SUFFIX_LEN &&
-	    !memcmp(cp - LOCK_SUFFIX_LEN, LOCK_SUFFIX, LOCK_SUFFIX_LEN))
-		return -1; /* Refname ends with ".lock". */
+	    !memcmp(cp - LOCK_SUFFIX_LEN, LOCK_SUFFIX, LOCK_SUFFIX_LEN)) {
+		/* Refname ends with ".lock". */
+		if (sanitized)
+			strbuf_strip_suffix(sanitized, LOCK_SUFFIX);
+		else
+			return -1;
+	}
 	return cp - refname;
 }
 
-int check_refname_format(const char *refname, int flags)
+static int check_or_sanitize_refname(const char *refname, int flags,
+				     struct strbuf *sanitized)
 {
 	int component_len, component_count = 0;
 
-	if (!strcmp(refname, "@"))
+	if (!strcmp(refname, "@")) {
 		/* Refname is a single character '@'. */
-		return -1;
+		if (sanitized)
+			strbuf_addch(sanitized, '-');
+		else
+			return -1;
+	}
 
 	while (1) {
+		if (sanitized && sanitized->len)
+			strbuf_complete(sanitized, '/');
+
 		/* We are at the start of a path component. */
-		component_len = check_refname_component(refname, &flags);
-		if (component_len <= 0)
+		component_len = check_refname_component(refname, &flags,
+							sanitized);
+		if (sanitized && component_len == 0)
+			; /* OK, omit empty component */
+		else if (component_len <= 0)
 			return -1;
 
 		component_count++;
@@ -138,13 +188,29 @@ int check_refname_format(const char *refname, int flags)
 		refname += component_len + 1;
 	}
 
-	if (refname[component_len - 1] == '.')
-		return -1; /* Refname ends with '.'. */
+	if (refname[component_len - 1] == '.') {
+		/* Refname ends with '.'. */
+		if (sanitized)
+			; /* omit ending dot */
+		else
+			return -1;
+	}
 	if (!(flags & REFNAME_ALLOW_ONELEVEL) && component_count < 2)
 		return -1; /* Refname has only one component. */
 	return 0;
 }
 
+int check_refname_format(const char *refname, int flags)
+{
+	return check_or_sanitize_refname(refname, flags, NULL);
+}
+
+void sanitize_refname(const char *refname, struct strbuf *out)
+{
+	if (check_or_sanitize_refname(refname, 0, out))
+		BUG("sanitizing refname check returned error");
+}
+
 int refname_is_safe(const char *refname)
 {
 	const char *rest;
diff --git a/refs.h b/refs.h
index 308fa1f03b..b99c309dd9 100644
--- a/refs.h
+++ b/refs.h
@@ -460,6 +460,12 @@ int for_each_reflog(each_ref_fn fn, void *cb_data);
  */
 int check_refname_format(const char *refname, int flags);
 
+/*
+ * Apply the rules from check_refname_format, but mutate the result until it
+ * is acceptable, and place the result in "out".
+ */
+void sanitize_refname(const char *refname, struct strbuf *out);
+
 const char *prettify_refname(const char *refname);
 
 char *shorten_unambiguous_ref(const char *refname, int strict);

  reply	other threads:[~2019-03-06 21:49 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-02-18 14:36 git gc fails with "unable to resolve reference" for worktree hi-angel
2019-02-18 15:02 ` Duy Nguyen
2019-02-18 15:09   ` hi-angel
2019-02-18 15:18     ` Duy Nguyen
2019-02-20 14:34       ` hi-angel
2019-02-21 11:00 ` [PATCH] worktree add: sanitize worktree names Nguyễn Thái Ngọc Duy
2019-02-21 11:28   ` Konstantin Kharlamov
2019-02-21 11:38     ` Duy Nguyen
2019-02-21 11:44       ` Konstantin Kharlamov
2019-02-21 11:52         ` Duy Nguyen
2019-02-21 13:23           ` Jeff King
2019-02-21 12:19   ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
2019-02-21 12:19     ` [PATCH v2 1/1] " Nguyễn Thái Ngọc Duy
2019-02-21 13:22       ` Jeff King
2019-02-21 17:41       ` Ramsay Jones
2019-02-22  9:21         ` Duy Nguyen
2019-02-26 10:58     ` [PATCH v3 0/1] " Nguyễn Thái Ngọc Duy
2019-02-26 10:58       ` [PATCH v3 1/1] " Nguyễn Thái Ngọc Duy
2019-02-27 12:08         ` Jeff King
2019-02-27 14:23           ` Eric Sunshine
2019-02-27 16:04             ` Jeff King
2019-03-03  1:22               ` Junio C Hamano
2019-03-04 11:19               ` Duy Nguyen
2019-03-04 12:04                 ` Duy Nguyen
2019-03-04 15:06         ` Johannes Schindelin
2019-03-05 12:08       ` [PATCH v4 0/2] " Nguyễn Thái Ngọc Duy
2019-03-05 12:08         ` [PATCH v4 1/2] refs.c: refactor check_refname_component() Nguyễn Thái Ngọc Duy
2019-03-06 21:49           ` Jeff King [this message]
2019-03-07 23:24             ` Eric Sunshine
2019-03-05 12:08         ` [PATCH v4 2/2] worktree add: sanitize worktree names Nguyễn Thái Ngọc Duy
2019-03-08  9:28         ` [PATCH v5 0/1] " Nguyễn Thái Ngọc Duy
2019-03-08  9:28           ` [PATCH v5 1/1] " Nguyễn Thái Ngọc Duy
2019-03-10  2:02             ` Eric Sunshine
2019-03-11  6:20               ` Junio C Hamano
2019-03-11  9:24                 ` Duy Nguyen
2019-03-11 22:39                   ` Jeff King
2019-03-12  6:32                     ` Junio C Hamano
2019-03-11  6:36             ` Junio C Hamano
2019-03-11  9:27               ` Duy Nguyen
2019-03-11 13:05             ` Johannes Schindelin
2019-03-12  6:45               ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190306214912.GA32630@sigill.intra.peff.net \
    --to=peff@peff.net \
    --cc=Johannes.Schindelin@gmx.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=hi-angel@yandex.ru \
    --cc=pclouds@gmail.com \
    --cc=ramsay@ramsayjones.plus.com \
    --cc=sunshine@sunshineco.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).