[PATCH 1/2] regex: port to weird isascii platforms

bug-gnulib@gnu.org mirror (unofficial)
 help / color / mirror / Atom feed

* [PATCH 1/2] regex: port to weird isascii platforms
@ 2020-09-24  0:05 Paul Eggert
  2020-09-24  0:05 ` [PATCH 2/2] regex: fix ignore-case Turkish bug Paul Eggert
  0 siblings, 1 reply; 3+ messages in thread
From: Paul Eggert @ 2020-09-24  0:05 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Paul Eggert

* lib/regex_internal.h (isascii) [!_LIBC]: Supply glibc version.
---
 ChangeLog            | 5 +++++
 lib/regex_internal.h | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 188bb8f2f..d15f158ab 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-09-23  Paul Eggert  <eggert@cs.ucla.edu>
+
+	regex: port to weird isascii platforms
+	* lib/regex_internal.h (isascii) [!_LIBC]: Supply glibc version.
+
 2020-09-20  Norihiro Tanaka  <noritnk@kcn.ne.jp>
 
 	dfa: make dfasupported a global function
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
index dbc503cd5..4a3cf779b 100644
--- a/lib/regex_internal.h
+++ b/lib/regex_internal.h
@@ -77,6 +77,14 @@
 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
 #endif
 
+/* regex code assumes isascii has its usual numeric meaning,
+   even if the portable character set uses EBCDIC encoding,
+   and even if wint_t is wider than int.  */
+#ifndef _LIBC
+# undef isascii
+# define isascii(c) (((c) & ~0x7f) == 0)
+#endif
+
 #ifdef _LIBC
 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
-- 
2.25.4



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] regex: fix ignore-case Turkish bug
  2020-09-24  0:05 [PATCH 1/2] regex: port to weird isascii platforms Paul Eggert
@ 2020-09-24  0:05 ` Paul Eggert
  2020-09-26 13:57   ` Bruno Haible
  0 siblings, 1 reply; 3+ messages in thread
From: Paul Eggert @ 2020-09-24  0:05 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Paul Eggert

* lib/regex_internal.c (build_wcs_upper_buffer):
Do not assume that converting single-byte character to upper
yields a single-byte character.  This is not true for Turkish,
where towupper (L'i') yields L'İ', which is not single-byte.
* tests/test-regex.c (main): Test for this bug.
---
 ChangeLog            |  7 +++++++
 lib/regex_internal.c | 19 ++++++++++---------
 tests/test-regex.c   | 41 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index d15f158ab..5c4d8f849 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2020-09-23  Paul Eggert  <eggert@cs.ucla.edu>
 
+	regex: fix ignore-case Turkish bug
+	* lib/regex_internal.c (build_wcs_upper_buffer):
+	Do not assume that converting single-byte character to upper
+	yields a single-byte character.  This is not true for Turkish,
+	where towupper (L'i') yields L'İ', which is not single-byte.
+	* tests/test-regex.c (main): Test for this bug.
+
 	regex: port to weird isascii platforms
 	* lib/regex_internal.h (isascii) [!_LIBC]: Supply glibc version.
 
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index e1b6b4d5a..ed0a13461 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -300,18 +300,20 @@ build_wcs_upper_buffer (re_string_t *pstr)
       while (byte_idx < end_idx)
 	{
 	  wchar_t wc;
+	  unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 
-	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
-	      && mbsinit (&pstr->cur_state))
+	  if (isascii (ch) && mbsinit (&pstr->cur_state))
 	    {
-	      /* In case of a singlebyte character.  */
-	      pstr->mbs[byte_idx]
-		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 	      /* The next step uses the assumption that wchar_t is encoded
 		 ASCII-safe: all ASCII values can be converted like this.  */
-	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
-	      ++byte_idx;
-	      continue;
+	      wchar_t wcu = __towupper (ch);
+	      if (isascii (wcu))
+		{
+		  pstr->mbs[byte_idx] = wcu;
+		  pstr->wcs[byte_idx] = wcu;
+		  byte_idx++;
+		  continue;
+		}
 	    }
 
 	  remain_len = end_idx - byte_idx;
@@ -348,7 +350,6 @@ build_wcs_upper_buffer (re_string_t *pstr)
 	    {
 	      /* It is an invalid character, an incomplete character
 		 at the end of the string, or '\0'.  Just use the byte.  */
-	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 	      pstr->mbs[byte_idx] = ch;
 	      /* And also cast it to wide char.  */
 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
diff --git a/tests/test-regex.c b/tests/test-regex.c
index d3f429aeb..b4e23c8c8 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -29,6 +29,15 @@
 
 #include "localcharset.h"
 
+/* Check whether it's really a UTF-8 locale.
+   On mingw, setlocale (LC_ALL, "en_US.UTF-8") succeeds but returns
+   "English_United States.1252", with locale_charset () returning "CP1252".  */
+static int
+really_utf8 (void)
+{
+  return strcmp (locale_charset (), "UTF-8") == 0;
+}
+
 int
 main (void)
 {
@@ -75,11 +84,7 @@ main (void)
           }
       }
 
-      /* Check whether it's really a UTF-8 locale.
-         On mingw, the setlocale call succeeds but returns
-         "English_United States.1252", with locale_charset() returning
-         "CP1252".  */
-      if (strcmp (locale_charset (), "UTF-8") == 0)
+      if (really_utf8 ())
         {
           /* This test is from glibc bug 15078.
              The test case is from Andreas Schwab in
@@ -119,6 +124,32 @@ main (void)
         return 1;
     }
 
+  if (setlocale (LC_ALL, "tr_TR.UTF-8") && really_utf8 ())
+    {
+      re_set_syntax (RE_SYNTAX_GREP | RE_ICASE);
+      if (re_compile_pattern ("i", 1, &regex))
+        result |= 1;
+      else
+        {
+          /* UTF-8 encoding of U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
+             In Turkish, this is the upper-case equivalent of ASCII "i".
+             Older versions of Gnulib failed to match "i" to U+0130 when
+             ignoring case in Turkish <https://bugs.gnu.org/43577>.  */
+          static char const data[] = "\xc4\xb0";
+
+          memset (&regs, 0, sizeof regs);
+          if (re_search (&regex, data, sizeof data - 1, 0, sizeof data - 1,
+                         &regs))
+            result |= 1;
+          regfree (&regex);
+          free (regs.start);
+          free (regs.end);
+
+          if (! setlocale (LC_ALL, "C"))
+            return 1;
+        }
+    }
+
   /* This test is from glibc bug 3957, reported by Andrew Mackey.  */
   re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE);
   memset (&regex, 0, sizeof regex);
-- 
2.25.4



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH 2/2] regex: fix ignore-case Turkish bug
  2020-09-24  0:05 ` [PATCH 2/2] regex: fix ignore-case Turkish bug Paul Eggert
@ 2020-09-26 13:57   ` Bruno Haible
  0 siblings, 0 replies; 3+ messages in thread
From: Bruno Haible @ 2020-09-26 13:57 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Paul Eggert

Paul Eggert wrote:
> +  if (setlocale (LC_ALL, "tr_TR.UTF-8") && really_utf8 ())
> +    {
> +      re_set_syntax (RE_SYNTAX_GREP | RE_ICASE);
> +      if (re_compile_pattern ("i", 1, &regex))
> +        result |= 1;
> +      else
> +        {
> +          /* UTF-8 encoding of U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
> +             In Turkish, this is the upper-case equivalent of ASCII "i".
> +             Older versions of Gnulib failed to match "i" to U+0130 when
> +             ignoring case in Turkish <https://bugs.gnu.org/43577>.  */
> +          static char const data[] = "\xc4\xb0";
> +
> +          memset (&regs, 0, sizeof regs);
> +          if (re_search (&regex, data, sizeof data - 1, 0, sizeof data - 1,
> +                         &regs))
> +            result |= 1;
> +          regfree (&regex);
> +          free (regs.start);
> +          free (regs.end);
> +
> +          if (! setlocale (LC_ALL, "C"))
> +            return 1;
> +        }
> +    }

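In this test code, it is possible that the first setlocale() call succeeds
but the second one is not invoked: for example, when really_utf8 () returns
false (the whole block is skipped), or when re_compile_pattern fails (the
reset to the "C" locale sits only in the 'else' branch). The effect would be
that the following tests get invoked in the Turkish locale, which may lead to
very confusing failure reports.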


2020-09-26  Bruno Haible  <bruno@clisp.org>

	regex-tests: Make test more robust.
	* tests/test-regex.c (main): Make sure to revert the locale to "C" after
	the test in "tr_TR.UTF-8" locale. Exit if we can't revert it.

(diff -w)
diff --git a/tests/test-regex.c b/tests/test-regex.c
index a54f643..3a3d8f1 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -139,11 +139,15 @@ main (void)
         }
 
       if (! setlocale (LC_ALL, "C"))
-        return 1;
+        {
+          report_error ("setlocale \"C\" failed");
+          return exit_status;
+        }
     }
 
-  if (setlocale (LC_ALL, "tr_TR.UTF-8") && really_utf8 ()
-      && towupper (L'i') == 0x0130 /* U+0130; see below.  */)
+  if (setlocale (LC_ALL, "tr_TR.UTF-8"))
+    {
+      if (really_utf8 () && towupper (L'i') == 0x0130 /* U+0130; see below.  */)
         {
           re_set_syntax (RE_SYNTAX_GREP | RE_ICASE);
           memset (&regex, 0, sizeof regex);
@@ -168,9 +172,13 @@ main (void)
               regfree (&regex);
               free (regs.start);
               free (regs.end);
+            }
+        }
 
       if (! setlocale (LC_ALL, "C"))
+        {
           report_error ("setlocale \"C\" failed");
+          return exit_status;
         }
     }
 



^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-09-26 13:58 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-24  0:05 [PATCH 1/2] regex: port to weird isascii platforms Paul Eggert
2020-09-24  0:05 ` [PATCH 2/2] regex: fix ignore-case Turkish bug Paul Eggert
2020-09-26 13:57   ` Bruno Haible

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).