git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: "René Scharfe" <l.s.r@web.de>
To: Junio C Hamano <gitster@pobox.com>
Cc: Diomidis Spinellis <dds@aueb.gr>,
	Eric Sunshine <sunshine@sunshineco.com>,
	Git List <git@vger.kernel.org>, demerphq <demerphq@gmail.com>,
	Mario Grgic <mario_grgic@hotmail.com>
Subject: Re: regex compilation error with --color-words
Date: Mon, 3 Apr 2023 21:32:51 +0200	[thread overview]
Message-ID: <bc6e89c9-d886-c519-85b3-fbc3f4eb5528@web.de> (raw)
In-Reply-To: <xmqq5yad7wv3.fsf@gitster.g>

Am 03.04.23 um 18:29 schrieb Junio C Hamano:
> René Scharfe <l.s.r@web.de> writes:
>
>> Actually we can drop the "|[\xc0-\xff][\x80-\xbf]+" part in that case
>> because the "[^[:space:]]" suffices.  And we probably need to do that at
>> runtime because it depends on the locale.  The rather elaborate patch
>> below does that.  It leaks the truncated word_regex, which isn't that
>> bad because it's done only once per run, but certainly untidy.
>
> Small ugliness like what we see below is fine in a technology
> demonostration.
>
>> I suspect/hope this can be done simpler and cleaner after refactoring
>> the userdiff code to allow for runtime assembly of regular expressions.
>
> Do we expect "does the regcomp(3) and regexec(3) correctly match a
> non-space multi-byte UTF-8 sequence as expected?" to be the only
> choices, do we expect we will choose from only two, and do we expect
> that the differences between the MB version and fallback version to
> be the same "OR_MULTI_BYTE_CHAR may be omitted"?  For now I think
> it would be reasonable to answer yes to all three.
>
> How are .is_builtin and .has_multi_byte_char_fallback bits expected
> to be used?  For what kind of files do we expect them to be set
> differently?

.is_builtin is unnecessary here.  It is a remnant of me noticing that we
don't add "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+" to user-defined
patterns, but .has_multi_byte_char_fallback alone suffices.

>
> In the simplest case, I would imagine that we could do this
>
>  	...
>  	const char *word_regex;
> +	const char *word_regex_wo_mb;
>  	const char *textconv;
>  	...
>
> in the definition of "struct userdifif_driver", use
>
>  #define PATTERNS(lang, rx, wrx) { \
>  	...
>  	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
> +	.word_regex_wo_mb = wrx "|[^[:space:]]", \

Ah, nice, no allocation or string manipulation at runtime at all, at
the small cost of having near-duplicate static strings.

>  }
>
> and similar for IPATTERN, and make a non-NULL .word.regex_wo_mb
> serve as the .has_multi_byte_char_fallback bit to trigger "does our
> regex engine do a good job for multi-byte?"
>
> Thanks.
>
>> diff --git a/userdiff.c b/userdiff.c
>> index 09203fbc35..aa2cd150ba 100644
>> --- a/userdiff.c
>> +++ b/userdiff.c
>> @@ -9,6 +9,8 @@ static struct userdiff_driver *drivers;
>>  static int ndrivers;
>>  static int drivers_alloc;
>>
>> +#define OR_MULTI_BYTE_CHAR "|[\xc0-\xff][\x80-\xbf]+"
>> +
>>  #define PATTERNS(lang, rx, wrx) { \
>>  	.name = lang, \
>>  	.binary = -1, \
>> @@ -16,7 +18,9 @@ static int drivers_alloc;
>>  		.pattern = rx, \
>>  		.cflags = REG_EXTENDED, \
>>  	}, \
>> -	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
>> +	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
>> +	.is_builtin = 1, \
>> +	.has_multi_byte_char_fallback = 1, \
>>  }
>>  #define IPATTERN(lang, rx, wrx) { \
>>  	.name = lang, \
>> @@ -25,7 +29,9 @@ static int drivers_alloc;
>>  		.pattern = rx, \
>>  		.cflags = REG_EXTENDED | REG_ICASE, \
>>  	}, \
>> -	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
>> +	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
>> +	.is_builtin = 1, \
>> +	.has_multi_byte_char_fallback = 1, \
>>  }
>>
>>  /*
>> @@ -330,6 +336,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
>>  	return 0;
>>  }
>>
>> +static int regexec_support_multi_byte_chars(void)
>> +{
>> +	static const char not_space[] = "[^[:space:]]";
>> +	static const char utf8_multi_byte_char[] = "\xc2\xa3";
>> +	regex_t re;
>> +	regmatch_t match;
>> +	static int result = -1;
>> +
>> +	if (result != -1)
>> +		return result;
>> +	if (regcomp(&re, not_space, REG_EXTENDED))
>> +		BUG("invalid regular expression: %s", not_space);
>> +	result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) &&
>> +		match.rm_so == 0 &&
>> +		match.rm_eo == strlen(utf8_multi_byte_char);
>> +	regfree(&re);
>> +	return result;
>> +}
>> +
>>  static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
>>  {
>>  	struct find_by_namelen_data udcbdata = {
>> @@ -337,6 +362,15 @@ static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t
>>  		.len = len,
>>  	};
>>  	for_each_userdiff_driver(userdiff_find_by_namelen_cb, &udcbdata);
>> +	if (udcbdata.driver &&
>> +	    udcbdata.driver->is_builtin &&
>> +	    udcbdata.driver->has_multi_byte_char_fallback &&
>> +	    regexec_support_multi_byte_chars()) {
>> +		const char *word_regex = udcbdata.driver->word_regex;
>> +		udcbdata.driver->word_regex = xmemdupz(word_regex,
>> +			strlen(word_regex) - strlen(OR_MULTI_BYTE_CHAR));
>> +		udcbdata.driver->has_multi_byte_char_fallback = 0;
>> +	}
>>  	return udcbdata.driver;
>>  }
>>
>> diff --git a/userdiff.h b/userdiff.h
>> index 24419db697..83f5863d58 100644
>> --- a/userdiff.h
>> +++ b/userdiff.h
>> @@ -21,6 +21,8 @@ struct userdiff_driver {
>>  	const char *textconv;
>>  	struct notes_cache *textconv_cache;
>>  	int textconv_want_cache;
>> +	int is_builtin;
>> +	int has_multi_byte_char_fallback;
>>  };
>>  enum userdiff_driver_type {
>>  	USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0,
>> --
>> 2.40.0

  reply	other threads:[~2023-04-03 19:33 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-29 22:55 regex compilation error with --color-words Eric Sunshine
2023-03-30  7:55 ` Diomidis Spinellis
2023-03-31 20:44   ` René Scharfe
2023-04-02  9:44     ` René Scharfe
2023-04-03 16:29       ` Junio C Hamano
2023-04-03 19:32         ` René Scharfe [this message]
2023-04-06 20:19           ` [PATCH] userdiff: support regexec(3) with multi-byte support René Scharfe
2023-04-06 22:35             ` Johannes Sixt
2023-04-07  7:49               ` René Scharfe
2023-04-07 10:56                 ` Johannes Sixt
2023-04-07 14:41             ` D. Ben Knoble
2023-04-07 16:02               ` Junio C Hamano
2023-04-07 17:23             ` Eric Sunshine
  -- strict thread matches above, loose matches on Subject: below --
2023-04-06 13:39 regex compilation error with --color-words Benjamin
2023-04-06 16:03 Benjamin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bc6e89c9-d886-c519-85b3-fbc3f4eb5528@web.de \
    --to=l.s.r@web.de \
    --cc=dds@aueb.gr \
    --cc=demerphq@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=mario_grgic@hotmail.com \
    --cc=sunshine@sunshineco.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).