unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Carlos O'Donell via Libc-alpha <libc-alpha@sourceware.org>
To: libc-alpha@sourceware.org, fweimer@redhat.com
Subject: [PATCH v4 1/4] Add support for processing wide ellipsis ranges in UTF-8.
Date: Wed, 28 Apr 2021 09:00:30 -0400	[thread overview]
Message-ID: <20210428130033.3196848-2-carlos@redhat.com> (raw)
In-Reply-To: <20210428130033.3196848-1-carlos@redhat.com>

If the input character map is UTF-8 then the ellipsis handling is
relaxed with regard to the POSIX requirement for null byte
output and instead a custom increment function is used to
correctly handle the ellipsis output to generate valid UTF-8
code points.

Developers of locales want to be able to write large ellipsis
sequences without having a priori knowledge of the encoding, which
would otherwise require them to split the ellipsis to avoid null
byte output.

Tested on x86_64 and i686 without regression.
---
 locale/programs/charmap.c | 174 ++++++++++++++++++++++++++++++++++----
 1 file changed, 156 insertions(+), 18 deletions(-)

diff --git a/locale/programs/charmap.c b/locale/programs/charmap.c
index 3d51e702dc..cb134e3b8a 100644
--- a/locale/programs/charmap.c
+++ b/locale/programs/charmap.c
@@ -49,7 +49,7 @@ static void new_width (struct linereader *cmfile, struct charmap_t *result,
 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 			      size_t nbytes, unsigned char *bytes,
 			      const char *from, const char *to,
-			      int decimal_ellipsis, int step);
+			      int decimal_ellipsis, int step, bool is_utf8);
 
 
 bool enc_not_ascii_compatible;
@@ -285,6 +285,27 @@ parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
   enum token_t ellipsis = 0;
   int step = 1;
 
+  /* POSIX explicitly requires that ellipsis processing do the
+     following: "Bytes shall be treated as unsigned octets, and carry
+     shall be propagated between the bytes as necessary to represent the
+     range."  It then goes on to say that such a declaration should
+     never be specified because it creates null bytes.  Therefore we
+     error on this condition (see charmap_new_char).  However this still
+     leaves a problem for encodings which use less than the full 8-bits,
+     like UTF-8, and in such encodings you can use an ellipsis to
+     silently and accidentally create invalid ranges.  In UTF-8 you have
+     only N-bits of the first byte and if your ellipsis covers a code
+     point range larger than this code point block the output is going
+     to be an invalid non-UTF-8 multi-byte sequence.  Thus for
+     UTF-8 we add a special ellipsis handling loop that can increment
+     UTF-8 multi-byte output effectively and for UTF-8 we allow larger
+     ellipsis ranges without error.  There may still be other encodings
+     for which the ellipsis will still generate invalid multi-byte
+     output, but not for UTF-8.  The only alternative would be to call
+     gconv for each Unicode code point in the loop to convert it to the
+     appropriate multi-byte output, but that would be slow.  */
+  bool is_utf8 = false;
+
   /* We don't want symbolic names in string to be translated.  */
   cmfile->translate_strings = 0;
 
@@ -385,9 +406,14 @@ parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 		}
 
 	      if (nowtok == tok_code_set_name)
-		result->code_set_name = obstack_copy0 (&result->mem_pool,
-						       arg->val.str.startmb,
-						       arg->val.str.lenmb);
+		{
+		  result->code_set_name = obstack_copy0 (&result->mem_pool,
+							 arg->val.str.startmb,
+							 arg->val.str.lenmb);
+
+		  if (strcmp (result->code_set_name, "UTF-8") == 0)
+		    is_utf8 = true;
+		}
 	      else
 		result->repertoiremap = obstack_copy0 (&result->mem_pool,
 						       arg->val.str.startmb,
@@ -570,7 +596,7 @@ character sets with locking states are not supported"));
 	  else
 	    charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 			      now->val.charcode.bytes, from_name, to_name,
-			      ellipsis != tok_ellipsis2, step);
+			      ellipsis != tok_ellipsis2, step, is_utf8);
 
 	  /* Ignore trailing comment silently.  */
 	  lr_ignore_rest (cmfile, 0);
@@ -929,12 +955,81 @@ charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 	  < 0 ? NULL : (struct charseq *) result);
 }
 
+/* Encode the Unicode code point CP as UTF-8 into the unsigned
+   character array BYTES, which must have room for NBYTES bytes.  No
+   terminating null byte is stored.
+
+   If the encoding of CP needs more than NBYTES bytes, or NBYTES is
+   less than 4 and CP is above 0x10ffff, then we return -1.
+
+   If NBYTES is at least 4 and CP is above 0x10ffff, the largest code
+   point encoded here, then we return -2.
+
+   Otherwise we return the number of bytes encoded, from 1 to 4.  */
+static int
+output_utf8_bytes (unsigned int cp, size_t nbytes, unsigned char *bytes)
+{
+  /* Every encoding needs at least 1 byte.  */
+  if (nbytes < 1)
+    return -1;
+
+  /* One byte range: 0xxxxxxx (CP is unsigned, so >= 0x0 always holds).  */
+  if (cp >= 0x0 && cp <= 0x7f)
+    {
+      bytes[0] = cp;
+      return 1;
+    }
+
+  /* Anything above 0x7f needs at least 2 bytes.  */
+  if (nbytes < 2)
+    return -1;
+
+  /* Two byte range: 110xxxxx 10xxxxxx.  */
+  if (cp >= 0x80 && cp <= 0x7ff)
+    {
+      bytes[0] = 0xc0 | ((cp & 0x07c0) >> 6);
+      bytes[1] = 0x80 | (cp & 0x003f);
+      return 2;
+    }
+
+  /* Anything above 0x7ff needs at least 3 bytes.  */
+  if (nbytes < 3)
+    return -1;
+
+  /* Three byte range: 1110xxxx 10xxxxxx 10xxxxxx.  Explicitly allow
+     the surrogate range from 0xd800 to 0xdfff since we want consistent
+     sorting of the invalid values that might appear in UTF-8 data.  */
+  if (cp >= 0x800 && cp <= 0xffff)
+    {
+      bytes[0] = 0xe0 | ((cp & 0xf000) >> 12);
+      bytes[1] = 0x80 | ((cp & 0x0fc0) >> 6);
+      bytes[2] = 0x80 | (cp & 0x003f);
+      return 3;
+    }
+
+  /* Anything above 0xffff needs at least 4 bytes.  */
+  if (nbytes < 4)
+    return -1;
+
+  /* Four byte range: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.  */
+  if (cp >= 0x10000 && cp <= 0x10ffff)
+    {
+      bytes[0] = 0xf0 | ((cp & 0x1c0000) >> 18);
+      bytes[1] = 0x80 | ((cp & 0x03f000) >> 12);
+      bytes[2] = 0x80 | ((cp & 0x000fc0) >> 6);
+      bytes[3] = 0x80 | (cp & 0x00003f);
+      return 4;
+    }
+
+  /* CP is above 0x10ffff: invalid code point.  */
+  return -2;
+}
 
 static void
 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 		  size_t nbytes, unsigned char *bytes,
 		  const char *from, const char *to,
-		  int decimal_ellipsis, int step)
+		  int decimal_ellipsis, int step, bool is_utf8)
 {
   hash_table *ht = &cm->char_table;
   hash_table *bt = &cm->byte_table;
@@ -1039,11 +1134,56 @@ hexadecimal range format should use only capital characters"));
   for (cnt = from_nr; cnt <= to_nr; cnt += step)
     {
       char *name_end;
+      unsigned char ubytes[4] = { '\0', '\0', '\0', '\0' };
       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
 		      prefix_len, from, len1 - prefix_len, cnt);
       obstack_1grow (ob, '\0');
       name_end = obstack_finish (ob);
 
+      /* Either we have a UTF-8 charmap, and we compute the bytes (see
+	 comment above), or we have a non-UTF-8 charmap and we follow
+	 POSIX rules as further below for incrementing the bytes in an
+	 ellipsis.  */
+      if (is_utf8)
+	{
+	  int nubytes;
+
+	  /* Directly convert the code point to the UTF-8 encoded bytes.  */
+	  nubytes = output_utf8_bytes (cnt, 4, ubytes);
+
+	  /* This should not happen, but we check for it just in case.  */
+	  if (nubytes == -1)
+	    lr_error (lr,
+		      _("not enough space to output UTF-8 encoding."));
+
+	  /* The other defect here could be that we have a mismatch
+	     between the code point and the encoded value or number of
+	     output bytes.  For example you specify U0000 but assign it
+	     an encoded value that is 3-bytes long (an error), or U0000
+	     is assigned a value of /x01.  */
+	  if (cnt == from_nr)
+	    {
+	      if (nubytes != nbytes)
+		lr_error (lr,
+			  _("encoding length does not match "
+			    "Unicode code point."));
+	      else
+		if (memcmp (bytes, ubytes, nbytes) != 0)
+		  lr_error (lr,
+			    _("encoded value does not match "
+			      "Unicode code point."));
+	    }
+
+	  /* The range does not cover one of the 4 UTF-8 code point ranges.  */
+	  if (nubytes == -2)
+	    lr_error (lr,
+		      _("invalid code point in the range."));
+
+	  /* Use the generated UTF-8 bytes.  */
+	  bytes = ubytes;
+	  nbytes = nubytes;
+	}
+
       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
       newp->nbytes = nbytes;
       memcpy (newp->bytes, bytes, nbytes);
@@ -1081,19 +1221,17 @@ hexadecimal range format should use only capital characters"));
       /* Please note we don't examine the return value since it is no error
 	 if we have two definitions for a symbol.  */
 
-      /* Increment the value in the byte sequence.  */
-      if (++bytes[nbytes - 1] == '\0')
-	{
-	  int b = nbytes - 2;
+      /* Increment the byte stream following POSIX rules.  */
+      if (!is_utf8)
+        bytes[nbytes - 1]++;
 
-	  do
-	    if (b < 0)
-	      {
-		lr_error (lr,
-			  _("resulting bytes for range not representable."));
-		return;
-	      }
-	  while (++bytes[b--] == 0);
+      /* If we overflowed then that generates a null byte which is an invalid
+	 specification according to POSIX and we issue a parser error.  */
+      if (bytes[nbytes - 1] == '\0')
+	{
+	  lr_error (lr,
+		    _("resulting bytes for range would contain null byte."));
+	  return;
 	}
     }
 }
-- 
2.26.3


  reply	other threads:[~2021-04-28 13:00 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-28 13:00 [PATCH v4 0/4] Add new C.UTF-8 locale (Bug 17318) Carlos O'Donell via Libc-alpha
2021-04-28 13:00 ` Carlos O'Donell via Libc-alpha [this message]
2021-04-29 14:11   ` [PATCH v4 1/4] Add support for processing wide ellipsis ranges in UTF-8 Florian Weimer via Libc-alpha
2021-04-28 13:00 ` [PATCH v4 2/4] Update UTF-8 charmap processing Carlos O'Donell via Libc-alpha
2021-04-29 14:07   ` Florian Weimer via Libc-alpha
2021-04-29 21:02     ` Carlos O'Donell via Libc-alpha
2021-04-30  4:18       ` Florian Weimer via Libc-alpha
2021-05-02 19:18         ` Carlos O'Donell via Libc-alpha
2021-04-28 13:00 ` [PATCH v4 3/4] Regenerate localedata files Carlos O'Donell via Libc-alpha
2021-04-29 21:03   ` Carlos O'Donell via Libc-alpha
2021-04-28 13:00 ` [PATCH v4 4/4] Add generic C.UTF-8 locale (Bug 17318) Carlos O'Donell via Libc-alpha
2021-04-29 14:13   ` Florian Weimer via Libc-alpha
2021-04-29 20:05     ` Carlos O'Donell via Libc-alpha
2021-04-30 17:59       ` Carlos O'Donell via Libc-alpha
2021-04-30 18:20         ` Florian Weimer via Libc-alpha
2021-05-02 19:18           ` Carlos O'Donell via Libc-alpha

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/libc/involved.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210428130033.3196848-2-carlos@redhat.com \
    --to=libc-alpha@sourceware.org \
    --cc=carlos@redhat.com \
    --cc=fweimer@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).