unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Carlos O'Donell via Libc-alpha <libc-alpha@sourceware.org>
To: libc-alpha@sourceware.org
Subject: [PATCH v4 1/3] Add support for locales with zero collation rules.
Date: Thu, 29 Jul 2021 02:35:13 -0400	[thread overview]
Message-ID: <20210729063515.1541388-2-carlos@redhat.com> (raw)
In-Reply-To: <20210729063515.1541388-1-carlos@redhat.com>

While there is code to handle 'nrules == 0' in various locations
within posix/fnmatch_loop.c, posix/regcomp.c and posix/regexec.c,
these conditionals do not work.  The only collation with zero
rules in effect today is the builtin C/POSIX locale which is
built by hand, and despite have zero rules it has a collseqmb
and collseqwc tables stored in the locale data. These tables are
simple identity tables which are not actually required and could
be removed at a later date after this change.  The changes are in
order to prepare for C.UTF-8 which has zero rules and has no
collation sequence tables (multibyte or widechar).

No regressions on x86_64 or i686.
---
 posix/fnmatch_loop.c | 95 +++++++++++++++++++++++++++-----------------
 posix/regcomp.c      | 12 +++---
 posix/regexec.c      | 85 ++++++++++++++++++---------------------
 3 files changed, 104 insertions(+), 88 deletions(-)

diff --git a/posix/fnmatch_loop.c b/posix/fnmatch_loop.c
index 7f938af590..547952f0a9 100644
--- a/posix/fnmatch_loop.c
+++ b/posix/fnmatch_loop.c
@@ -51,6 +51,7 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
     _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQMB);
 # endif
 #endif
+  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 
   while ((c = *p++) != L_('\0'))
     {
@@ -324,8 +325,6 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                        diagnose a "used initialized" in a dead branch in the
                        findidx function.  */
                     UCHAR str;
-                    uint32_t nrules =
-                      _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
                     const CHAR *startp = p;
 
                     c = *++p;
@@ -437,8 +436,6 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
 
                     if (c == L_('[') && *p == L_('.'))
                       {
-                        uint32_t nrules =
-                          _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
                         const CHAR *startp = p;
                         size_t c1 = 0;
 
@@ -600,42 +597,51 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                     if (c == L_('-') && *p != L_(']'))
                       {
 #if _LIBC
-                        /* We have to find the collation sequence
-                           value for C.  Collation sequence is nothing
-                           we can regularly access.  The sequence
-                           value is defined by the order in which the
-                           definitions of the collation values for the
-                           various characters appear in the source
-                           file.  A strange concept, nowhere
-                           documented.  */
-                        uint32_t fcollseq;
-                        uint32_t lcollseq;
+			/* We must find the collation sequence values for
+			   the low part of the range, the high part of the
+			   range and the searched value FN.  We do this by
+			   using the POSIX concept of Collation Element
+			   Ordering, which is the defined order of elements
+			   in the source locale.  FCOLLSEQ is the searched
+			   element in the range, while LCOLLSEQ is the low
+			   element in the range.  If we have no collation
+			   rules (nrules == 0) then we must fall back to a
+			   basic code point value for the collation
+			   sequence value (which is correct for ASCII and
+			   UTF-8).  We must never use collseq if nrules ==
+			   0 since none of the tables we need will be
+			   present in the compiled binary locale.  We start
+			   with fcollseq and lcollseq at unknown collation
+			   sequences.  We only compute hcollseq, the high
+			   part of the range if required.  */
+                        uint32_t fcollseq = ~((uint32_t) 0);
+                        uint32_t lcollseq = ~((uint32_t) 0);
                         UCHAR cend = *p++;
 
+			if (nrules != 0)
+			  {
 # if WIDE_CHAR_VERSION
-                        /* Search in the 'names' array for the characters.  */
-                        fcollseq = __collseq_table_lookup (collseq, fn);
-                        if (fcollseq == ~((uint32_t) 0))
-                          /* XXX We don't know anything about the character
-                             we are supposed to match.  This means we are
-                             failing.  */
-                          goto range_not_matched;
-
-                        if (is_seqval)
-                          lcollseq = cold;
-                        else
-                          lcollseq = __collseq_table_lookup (collseq, cold);
+			    /* Search the collation data for the character.  */
+			    fcollseq = __collseq_table_lookup (collseq, fn);
+			    if (fcollseq == ~((uint32_t) 0))
+			      /* We don't know anything about the character
+				 we are supposed to match.  This means we are
+				 failing.  */
+			      goto range_not_matched;
+
+			    if (is_seqval)
+			      lcollseq = cold;
+			    else
+			      lcollseq = __collseq_table_lookup (collseq, cold);
 # else
-                        fcollseq = collseq[fn];
-                        lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
+			    fcollseq = collseq[fn];
+			    lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
 # endif
+			  }
 
                         is_seqval = false;
                         if (cend == L_('[') && *p == L_('.'))
                           {
-                            uint32_t nrules =
-                              _NL_CURRENT_WORD (LC_COLLATE,
-                                                _NL_COLLATE_NRULES);
                             const CHAR *startp = p;
                             size_t c1 = 0;
 
@@ -752,14 +758,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                             cend = FOLD (cend);
                           }
 
-                        /* XXX It is not entirely clear to me how to handle
-                           characters which are not mentioned in the
-                           collation specification.  */
-                        if (
+			/* If we have rules, and the low sequence is lower than
+			   the value of the searched sequence then we must
+			   lookup the high collation sequence value and
+			   determine if the fcollseq falls within the range.
+			   If hcollseq is unknown then we could still match
+			   fcollseq on the low end of the range.  If lcollseq
+			   if unknown (0xffffffff) we will still fail to
+			   match, but in the future we might consider matching
+			   the high end of the range on an exact match.  */
+                        if (nrules != 0 && (
 # if WIDE_CHAR_VERSION
                             lcollseq == 0xffffffff ||
 # endif
-                            lcollseq <= fcollseq)
+                            lcollseq <= fcollseq))
                           {
                             /* We have to look at the upper bound.  */
                             uint32_t hcollseq;
@@ -789,6 +801,17 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                             if (lcollseq <= hcollseq && fcollseq <= hcollseq)
                               goto matched;
                           }
+
+			/* No rules, but we have a range.  */
+			if (nrules == 0)
+			  {
+			    if (cend == L_('\0'))
+			      return FNM_NOMATCH;
+
+			    /* Compare that fn is within the range.  */
+			    if ((UCHAR) cold <= fn && fn <= cend)
+			      goto matched;
+			  }
 # if WIDE_CHAR_VERSION
                       range_not_matched:
 # endif
diff --git a/posix/regcomp.c b/posix/regcomp.c
index d93698ae78..f55d20cbfd 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -2889,7 +2889,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	  if (MB_CUR_MAX == 1)
 	  */
 	  if (nrules == 0)
-	    return collseqmb[br_elem->opr.ch];
+	    return br_elem->opr.ch;
 	  else
 	    {
 	      wint_t wc = __btowc (br_elem->opr.ch);
@@ -2900,6 +2900,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	{
 	  if (nrules != 0)
 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+	  else
+	    return br_elem->opr.wch;
 	}
       else if (br_elem->type == COLL_SYM)
 	{
@@ -2935,7 +2937,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 		}
 	    }
 	  else if (sym_name_len == 1)
-	    return collseqmb[br_elem->opr.name[0]];
+	    return br_elem->opr.name[0];
 	}
       return UINT_MAX;
     }
@@ -3017,7 +3019,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	  if (MB_CUR_MAX == 1)
 	  */
 	  if (nrules == 0)
-	    ch_collseq = collseqmb[ch];
+	    ch_collseq = ch;
 	  else
 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
@@ -3103,11 +3105,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
   int token_len;
   bool first_round = true;
 #ifdef _LIBC
-  collseqmb = (const unsigned char *)
-    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
   if (nrules)
     {
+      collseqmb = (const unsigned char *)
+	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
       /*
       if (MB_CUR_MAX > 1)
       */
diff --git a/posix/regexec.c b/posix/regexec.c
index f7b4f9cfc3..6cc23831aa 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -3858,62 +3858,53 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
 }
 
 # ifdef _LIBC
+#include <assert.h>
+
 static unsigned int
 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
 {
-  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-  if (nrules == 0)
-    {
-      if (mbs_len == 1)
-	{
-	  /* No valid character.  Match it as a single byte character.  */
-	  const unsigned char *collseq = (const unsigned char *)
-	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
-	  return collseq[mbs[0]];
-	}
-      return UINT_MAX;
-    }
-  else
-    {
-      int32_t idx;
-      const unsigned char *extra = (const unsigned char *)
+  int32_t idx;
+  const unsigned char *extra = (const unsigned char *)
 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
-      int32_t extrasize = (const unsigned char *)
+  int32_t extrasize = (const unsigned char *)
 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
+  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+
+  /* Only called from within 'if (nrules != 0)'.  */ 
+  assert (nrules != 0);
 
-      for (idx = 0; idx < extrasize;)
+  for (idx = 0; idx < extrasize;)
+    {
+      int mbs_cnt;
+      bool found = false;
+      int32_t elem_mbs_len;
+      /* Skip the name of collating element name.  */
+      idx = idx + extra[idx] + 1;
+      elem_mbs_len = extra[idx++];
+      if (mbs_len == elem_mbs_len)
 	{
-	  int mbs_cnt;
-	  bool found = false;
-	  int32_t elem_mbs_len;
-	  /* Skip the name of collating element name.  */
-	  idx = idx + extra[idx] + 1;
-	  elem_mbs_len = extra[idx++];
-	  if (mbs_len == elem_mbs_len)
-	    {
-	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
-		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
-		  break;
-	      if (mbs_cnt == elem_mbs_len)
-		/* Found the entry.  */
-		found = true;
-	    }
-	  /* Skip the byte sequence of the collating element.  */
-	  idx += elem_mbs_len;
-	  /* Adjust for the alignment.  */
-	  idx = (idx + 3) & ~3;
-	  /* Skip the collation sequence value.  */
-	  idx += sizeof (uint32_t);
-	  /* Skip the wide char sequence of the collating element.  */
-	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
-	  /* If we found the entry, return the sequence value.  */
-	  if (found)
-	    return *(uint32_t *) (extra + idx);
-	  /* Skip the collation sequence value.  */
-	  idx += sizeof (uint32_t);
+	  for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
+	    if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
+	      break;
+	  if (mbs_cnt == elem_mbs_len)
+	    /* Found the entry.  */
+	    found = true;
 	}
-      return UINT_MAX;
+      /* Skip the byte sequence of the collating element.  */
+      idx += elem_mbs_len;
+      /* Adjust for the alignment.  */
+      idx = (idx + 3) & ~3;
+      /* Skip the collation sequence value.  */
+      idx += sizeof (uint32_t);
+      /* Skip the wide char sequence of the collating element.  */
+      idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
+      /* If we found the entry, return the sequence value.  */
+      if (found)
+        return *(uint32_t *) (extra + idx);
+      /* Skip the collation sequence value.  */
+      idx += sizeof (uint32_t);
     }
+  return UINT_MAX;
 }
 # endif /* _LIBC */
 #endif /* RE_ENABLE_I18N */
-- 
2.31.1


  reply	other threads:[~2021-07-29  6:38 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-29  6:35 [PATCH v4 0/3] C.UTF-8 Carlos O'Donell via Libc-alpha
2021-07-29  6:35 ` Carlos O'Donell via Libc-alpha [this message]
2021-07-29  9:44   ` [PATCH v4 1/3] Add support for locales with zero collation rules Florian Weimer via Libc-alpha
2021-07-29 19:12     ` Paul Eggert
2021-07-29  6:35 ` [PATCH v4 2/3] Add 'strcmp_collation' support for LC_COLLATE Carlos O'Donell via Libc-alpha
2021-07-29  6:35 ` [PATCH v4 3/3] Add generic C.UTF-8 locale (Bug 17318) Carlos O'Donell via Libc-alpha
2021-07-29  7:53 ` [PATCH v4 0/3] C.UTF-8 Florian Weimer via Libc-alpha
2021-07-30  3:12   ` Carlos O'Donell via Libc-alpha
2021-08-18  8:12   ` Mike Frysinger via Libc-alpha

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/libc/involved.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210729063515.1541388-2-carlos@redhat.com \
    --to=libc-alpha@sourceware.org \
    --cc=carlos@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).