git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	plamen.totev@abv.bg, l.s.r@web.de,
	"Eric Sunshine" <sunshine@sunshineco.com>,
	tboegi@web.de, "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v4 05/10] grep/icase: avoid kwsset when -F is specified
Date: Fri, 21 Aug 2015 19:47:15 +0700	[thread overview]
Message-ID: <1440161240-28554-6-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1440161240-28554-1-git-send-email-pclouds@gmail.com>

Similar to the previous commit, we can't use kwsset for a
case-insensitive search when the pattern contains non-ASCII characters.
But we can't simply pass the pattern to regcomp/pcre like the previous
commit did, because it may contain regex special characters, so we need
to quote the pattern first.

To avoid misquoting traps that could lead to undefined behavior, we
always stick to the basic regex engine in this case. We don't need fancy
features for grepping a literal string anyway.

basic_regex_quote_buf() assumes that if the pattern is in a multibyte
encoding, ASCII characters are unambiguously encoded as single
bytes. This is true at least for UTF-8. For other encodings, let's wait
until people speak up. Chances are nobody uses multibyte, non-UTF-8
charsets any more.

Helped-by: René Scharfe <l.s.r@web.de>
Noticed-by: Plamen Totev <plamen.totev@abv.bg>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 grep.c                          | 25 ++++++++++++++++++++++++-
 quote.c                         | 37 +++++++++++++++++++++++++++++++++++++
 quote.h                         |  1 +
 t/t7812-grep-icase-non-ascii.sh | 26 ++++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/grep.c b/grep.c
index d795b0e..8fce54f 100644
--- a/grep.c
+++ b/grep.c
@@ -5,6 +5,7 @@
 #include "diff.h"
 #include "diffcore.h"
 #include "commit.h"
+#include "quote.h"
 
 static int grep_source_load(struct grep_source *gs);
 static int grep_source_is_binary(struct grep_source *gs);
@@ -397,6 +398,24 @@ static int is_fixed(const char *s, size_t len)
 	return 1;
 }
 
+static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
+{
+	struct strbuf sb = STRBUF_INIT;
+	int err;
+	/* Compile p->pattern as a literal: quote it, then regcomp() it with REG_EXTENDED cleared. */
+	basic_regex_quote_buf(&sb, p->pattern);
+	err = regcomp(&p->regexp, sb.buf, opt->regflags & ~REG_EXTENDED);
+	if (opt->debug)
+		fprintf(stderr, "fixed%s\n", sb.buf); /* shows the quoted pattern actually compiled */
+	strbuf_release(&sb);
+	if (err) {
+		char errbuf[1024];
+		regerror(err, &p->regexp, errbuf, 1024); /* turn the regcomp() error code into text */
+		regfree(&p->regexp);
+		compile_regexp_failed(p, errbuf); /* defined outside this patch; presumably reports errbuf - confirm */
+	}
+}
+
 static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
 {
 	int icase_non_ascii;
@@ -411,7 +430,11 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
 	if (!icase_non_ascii && is_fixed(p->pattern, p->patternlen))
 		p->fixed = 1;
 	else if (opt->fixed) {
-		p->fixed = 1;
+		p->fixed = !icase_non_ascii;
+		if (!p->fixed) {
+			compile_fixed_regexp(p, opt);
+			return;
+		}
 	} else
 		p->fixed = 0;
 
diff --git a/quote.c b/quote.c
index 7920e18..43a8057 100644
--- a/quote.c
+++ b/quote.c
@@ -439,3 +439,40 @@ void tcl_quote_buf(struct strbuf *sb, const char *src)
 	}
 	strbuf_addch(sb, '"');
 }
+
+void basic_regex_quote_buf(struct strbuf *sb, const char *src)
+{
+	char c;
+	/* Append src to sb, backslash-escaping the characters the code below treats as basic-regex special. */
+	if (*src == '^') {
+		/* '^' is escaped only when it is the very first character of src */
+		strbuf_addch(sb, '\\');
+		strbuf_addch(sb, *src++);
+	}
+	if (*src == '*')
+		/* leading '*' is copied unescaped; NOTE(review): right after an escaped '^' it may act as a repetition operator - confirm */
+		strbuf_addch(sb, *src++);
+
+	while ((c = *src++)) {
+		switch (c) {
+		case '[':
+		case '.':
+		case '\\':
+		case '*':
+			strbuf_addch(sb, '\\');
+			strbuf_addch(sb, c);
+			break;
+
+		case '$':
+			/* src already points past c here, so this escapes '$' only when it is the last character */
+			if (*src == '\0')
+				strbuf_addch(sb, '\\');
+			strbuf_addch(sb, c);
+			break;
+
+		default:
+			strbuf_addch(sb, c);
+			break;
+		}
+	}
+}
diff --git a/quote.h b/quote.h
index 99e04d3..362d315 100644
--- a/quote.h
+++ b/quote.h
@@ -67,5 +67,6 @@ extern char *quote_path_relative(const char *in, const char *prefix,
 extern void perl_quote_buf(struct strbuf *sb, const char *src);
 extern void python_quote_buf(struct strbuf *sb, const char *src);
 extern void tcl_quote_buf(struct strbuf *sb, const char *src);
+extern void basic_regex_quote_buf(struct strbuf *sb, const char *src);
 
 #endif
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index 6eff490..aba6b15 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -20,4 +20,30 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' '
 	git grep -i "TILRAUN: HALLÓ HEIMUR!"
 '
 
+test_expect_success REGEX_LOCALE 'grep literal string, with -F' '
+	git grep --debug -i -F "TILRAUN: Halló Heimur!"  2>&1 >/dev/null |
+		 grep fixed >debug1 &&
+	echo "fixedTILRAUN: Halló Heimur!" >expect1 &&
+	test_cmp expect1 debug1 &&
+
+	git grep --debug -i -F "TILRAUN: HALLÓ HEIMUR!"  2>&1 >/dev/null |
+		 grep fixed >debug2 &&
+	echo "fixedTILRAUN: HALLÓ HEIMUR!" >expect2 &&
+	test_cmp expect2 debug2
+'
+
+test_expect_success REGEX_LOCALE 'grep string with regex, with -F' '
+	printf "^*TILR^AUN:.* \\Halló \$He[]imur!\$" >file &&
+
+	git grep --debug -i -F "^*TILR^AUN:.* \\Halló \$He[]imur!\$" 2>&1 >/dev/null |
+		 grep fixed >debug1 &&
+	echo "fixed\\^*TILR^AUN:\\.\\* \\\\Halló \$He\\[]imur!\\\$" >expect1 &&
+	test_cmp expect1 debug1 &&
+
+	git grep --debug -i -F "^*TILR^AUN:.* \\HALLÓ \$HE[]IMUR!\$"  2>&1 >/dev/null |
+		 grep fixed >debug2 &&
+	echo "fixed\\^*TILR^AUN:\\.\\* \\\\HALLÓ \$HE\\[]IMUR!\\\$" >expect2 &&
+	test_cmp expect2 debug2
+'
+
 test_done
-- 
2.3.0.rc1.137.g477eb31

  parent reply	other threads:[~2015-08-21 12:58 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-07-06 11:28 Git grep does not support multi-byte characters (like UTF-8) Plamen Totev
2015-07-06 12:23 ` Duy Nguyen
2015-07-07  8:58   ` Plamen Totev
2015-07-07 12:22     ` Duy Nguyen
2015-07-07 16:07     ` Junio C Hamano
2015-07-07 18:08       ` Plamen Totev
2015-07-08  2:19         ` Duy Nguyen
2015-07-08  4:52           ` Junio C Hamano
2015-07-06 12:42 ` [PATCH] grep: use regcomp() for icase search with non-ascii patterns Nguyễn Thái Ngọc Duy
2015-07-06 20:10   ` René Scharfe
2015-07-06 23:02     ` Duy Nguyen
2015-07-07 14:25       ` Plamen Totev
2015-07-08 10:38   ` [PATCH v2 0/9] icase match on non-ascii Nguyễn Thái Ngọc Duy
2015-07-08 10:38     ` [PATCH v2 1/9] grep: allow -F -i combination Nguyễn Thái Ngọc Duy
2015-07-08 10:38     ` [PATCH v2 2/9] grep: break down an "if" stmt in preparation for next changes Nguyễn Thái Ngọc Duy
2015-07-08 10:38     ` [PATCH v2 3/9] grep/icase: avoid kwsset on literal non-ascii strings Nguyễn Thái Ngọc Duy
2015-07-08 10:38     ` [PATCH v2 4/9] grep/icase: avoid kwsset when -F is specified Nguyễn Thái Ngọc Duy
2015-07-08 10:38     ` [PATCH v2 5/9] grep/pcre: prepare locale-dependent tables for icase matching Nguyễn Thái Ngọc Duy
2015-07-08 11:00       ` Duy Nguyen
2015-07-08 10:38     ` [PATCH v2 6/9] gettext: add is_utf8_locale() Nguyễn Thái Ngọc Duy
2015-07-08 10:38     ` [PATCH v2 7/9] grep/pcre: support utf-8 Nguyễn Thái Ngọc Duy
2015-07-11  8:07       ` Plamen Totev
2015-07-08 10:38     ` [PATCH v2 8/9] diffcore-pickaxe: "share" regex error handling code Nguyễn Thái Ngọc Duy
2015-07-08 10:38     ` [PATCH v2 9/9] diffcore-pickaxe: support case insensitive match on non-ascii Nguyễn Thái Ngọc Duy
2015-07-09 22:55       ` Eric Sunshine
2015-07-08 11:32     ` [PATCH v2 0/9] icase " Torsten Bögershausen
2015-07-08 12:13       ` Duy Nguyen
2015-07-08 15:36     ` Junio C Hamano
2015-07-08 23:28       ` Duy Nguyen
2015-07-14 13:24     ` [PATCH v3 " Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 1/9] grep: allow -F -i combination Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 2/9] grep: break down an "if" stmt in preparation for next changes Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 3/9] grep/icase: avoid kwsset on literal non-ascii strings Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 4/9] grep/icase: avoid kwsset when -F is specified Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 5/9] grep/pcre: prepare locale-dependent tables for icase matching Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 6/9] gettext: add is_utf8_locale() Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 7/9] grep/pcre: support utf-8 Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 8/9] diffcore-pickaxe: "share" regex error handling code Nguyễn Thái Ngọc Duy
2015-07-14 13:24       ` [PATCH v3 9/9] diffcore-pickaxe: support case insensitive match on non-ascii Nguyễn Thái Ngọc Duy
2015-07-14 16:42       ` [PATCH v3 0/9] icase " Torsten Bögershausen
2015-07-15  9:39         ` Duy Nguyen
2015-07-15 19:51           ` Torsten Bögershausen
2015-08-21 12:47       ` [PATCH v4 00/10] " Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 01/10] grep: allow -F -i combination Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 02/10] grep: break down an "if" stmt in preparation for next changes Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 03/10] test-regex: expose full regcomp() to the command line Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 04/10] grep/icase: avoid kwsset on literal non-ascii strings Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` Nguyễn Thái Ngọc Duy [this message]
2015-08-21 12:47         ` [PATCH v4 06/10] grep/pcre: prepare locale-dependent tables for icase matching Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 07/10] gettext: add is_utf8_locale() Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 08/10] grep/pcre: support utf-8 Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 09/10] diffcore-pickaxe: "share" regex error handling code Nguyễn Thái Ngọc Duy
2015-08-21 12:47         ` [PATCH v4 10/10] diffcore-pickaxe: support case insensitive match on non-ascii Nguyễn Thái Ngọc Duy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1440161240-28554-6-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=l.s.r@web.de \
    --cc=plamen.totev@abv.bg \
    --cc=sunshine@sunshineco.com \
    --cc=tboegi@web.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).