git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: "René Scharfe" <l.s.r@web.de>
To: Diomidis Spinellis <dds@aueb.gr>,
	Eric Sunshine <sunshine@sunshineco.com>,
	Git List <git@vger.kernel.org>
Cc: Junio C Hamano <gitster@pobox.com>, demerphq <demerphq@gmail.com>,
	Mario Grgic <mario_grgic@hotmail.com>
Subject: Re: regex compilation error with --color-words
Date: Sun, 2 Apr 2023 11:44:30 +0200	[thread overview]
Message-ID: <c4728fac-bea9-3794-077e-c978d99f46bf@web.de> (raw)
In-Reply-To: <b8b3777b-ee6e-d90e-3365-5cb9c9d129fe@web.de>

Am 31.03.23 um 22:44 schrieb René Scharfe:
> Am 30.03.23 um 09:55 schrieb Diomidis Spinellis:
>> On 30-Mar-23 1:55, Eric Sunshine wrote:
>>> I'm encountering a failure on macOS High Sierra 10.13.6 when using
>>> --color-words:
>>
>> The built-in word separation regular expression pattern for the Perl language fails to work with the macOS regex engine.  The same also happens with the FreeBSD one (tested on 14.0).
>>
>> The issue can be replicated through the following sequence of commands.
>>
>> git init color-words
>> cd color-words
>> echo '*.pl   diff=perl' >.gitattributes
>> echo 'print 42;' >t.pl
>> git add t.pl
>> git commit -am Add
>> git show --color-words
>
> Or in Git's own repo:
>
>    $ git log -p --color-words --no-merges '*.c'
>    Schwerwiegend: invalid regular expression: [a-zA-Z_][a-zA-Z0-9_]*|[0-9][0-9.]*([Ee][-+]?[0-9]+)?[fFlLuU]*|0[xXbB][0-9a-fA-F]+[lLuU]*|\.[0-9][0-9]*([Ee][-+]?[0-9]+)?[fFlL]?|[-+*/<>%&^|=!]=|--|\+\+|<<=?|>>=?|&&|\|\||::|->\*?|\.\*|<=>|[^[:space:]]|[<C0>-<FF>][<80>-<BF>]+
>    commit 14b9a044798ebb3858a1f1a1377309a3d6054ac8
>    [...]
>
> The error disappears when localization is turned off:
>
>    $ LANG=C git log -p --color-words --no-merges '*.c' >/dev/null
>    # just finishes without an error
>
> The issue also vanishes when the "|[\xc0-\xff][\x80-\xbf]+" part is
> removed that the macros PATTERNS and IPATTERN in userdiff.c append.
>
> So it seems regcomp(1) on macOS doesn't like invalid Unicode characters
> unless it's in ASCII mode (LANG=C).  664d44ee7f (userdiff: simplify
> word-diff safeguard, 2011-01-11) explains that this part exists to match
> a multi-byte UTF-8 character.  With a regcomp(1) that supports
> multi-byte characters natively they need to be specified differently, I
> guess, perhaps like this "[^\x00-\x7f]"?

Actually we can drop the "|[\xc0-\xff][\x80-\xbf]+" part in that case
because "[^[:space:]]" alone already matches a whole multi-byte
character.  And we probably need to do that at runtime because it
depends on the locale.  The rather elaborate patch below does that.  It
leaks the truncated word_regex, which isn't that bad because it happens
at most once per built-in driver per run, but it is certainly untidy.

I suspect/hope this can be done more simply and cleanly after refactoring
the userdiff code to allow for runtime assembly of regular expressions.

Also, a correction to my earlier mail: the function in question is
regcomp(3), or rather regexec(3), not regcomp(1).

---
 userdiff.c | 38 ++++++++++++++++++++++++++++++++++++--
 userdiff.h |  2 ++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/userdiff.c b/userdiff.c
index 09203fbc35..aa2cd150ba 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -9,6 +9,8 @@ static struct userdiff_driver *drivers;
 static int ndrivers;
 static int drivers_alloc;

+#define OR_MULTI_BYTE_CHAR "|[\xc0-\xff][\x80-\xbf]+"
+
 #define PATTERNS(lang, rx, wrx) { \
 	.name = lang, \
 	.binary = -1, \
@@ -16,7 +18,9 @@ static int drivers_alloc;
 		.pattern = rx, \
 		.cflags = REG_EXTENDED, \
 	}, \
-	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
+	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
+	.is_builtin = 1, \
+	.has_multi_byte_char_fallback = 1, \
 }
 #define IPATTERN(lang, rx, wrx) { \
 	.name = lang, \
@@ -25,7 +29,9 @@ static int drivers_alloc;
 		.pattern = rx, \
 		.cflags = REG_EXTENDED | REG_ICASE, \
 	}, \
-	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
+	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
+	.is_builtin = 1, \
+	.has_multi_byte_char_fallback = 1, \
 }

 /*
@@ -330,6 +336,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
 	return 0;
 }

+static int regexec_support_multi_byte_chars(void)
+{
+	static const char not_space[] = "[^[:space:]]";
+	static const char utf8_multi_byte_char[] = "\xc2\xa3";
+	regex_t re;
+	regmatch_t match;
+	static int result = -1;
+
+	if (result != -1)
+		return result;
+	if (regcomp(&re, not_space, REG_EXTENDED))
+		BUG("invalid regular expression: %s", not_space);
+	result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) &&
+		match.rm_so == 0 &&
+		match.rm_eo == strlen(utf8_multi_byte_char);
+	regfree(&re);
+	return result;
+}
+
 static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
 {
 	struct find_by_namelen_data udcbdata = {
@@ -337,6 +362,15 @@ static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t
 		.len = len,
 	};
 	for_each_userdiff_driver(userdiff_find_by_namelen_cb, &udcbdata);
+	if (udcbdata.driver &&
+	    udcbdata.driver->is_builtin &&
+	    udcbdata.driver->has_multi_byte_char_fallback &&
+	    regexec_support_multi_byte_chars()) {
+		const char *word_regex = udcbdata.driver->word_regex;
+		udcbdata.driver->word_regex = xmemdupz(word_regex,
+			strlen(word_regex) - strlen(OR_MULTI_BYTE_CHAR));
+		udcbdata.driver->has_multi_byte_char_fallback = 0;
+	}
 	return udcbdata.driver;
 }

diff --git a/userdiff.h b/userdiff.h
index 24419db697..83f5863d58 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -21,6 +21,8 @@ struct userdiff_driver {
 	const char *textconv;
 	struct notes_cache *textconv_cache;
 	int textconv_want_cache;
+	int is_builtin;
+	int has_multi_byte_char_fallback;
 };
 enum userdiff_driver_type {
 	USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0,
--
2.40.0

  reply	other threads:[~2023-04-02  9:44 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-29 22:55 regex compilation error with --color-words Eric Sunshine
2023-03-30  7:55 ` Diomidis Spinellis
2023-03-31 20:44   ` René Scharfe
2023-04-02  9:44     ` René Scharfe [this message]
2023-04-03 16:29       ` Junio C Hamano
2023-04-03 19:32         ` René Scharfe
2023-04-06 20:19           ` [PATCH] userdiff: support regexec(3) with multi-byte support René Scharfe
2023-04-06 22:35             ` Johannes Sixt
2023-04-07  7:49               ` René Scharfe
2023-04-07 10:56                 ` Johannes Sixt
2023-04-07 14:41             ` D. Ben Knoble
2023-04-07 16:02               ` Junio C Hamano
2023-04-07 17:23             ` Eric Sunshine
  -- strict thread matches above, loose matches on Subject: below --
2023-04-06 13:39 regex compilation error with --color-words Benjamin
2023-04-06 16:03 Benjamin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c4728fac-bea9-3794-077e-c978d99f46bf@web.de \
    --to=l.s.r@web.de \
    --cc=dds@aueb.gr \
    --cc=demerphq@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=mario_grgic@hotmail.com \
    --cc=sunshine@sunshineco.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).