bug-gnulib@gnu.org mirror (unofficial)
 help / color / mirror / Atom feed
* [PATCH] gen-uni-tables: Adjust `is_nonspacing`
@ 2024-02-11 19:02 Jules Bertholet
  2024-02-12 18:11 ` Bruno Haible
  0 siblings, 1 reply; 4+ messages in thread
From: Jules Bertholet @ 2024-02-11 19:02 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Jules Bertholet

Makes two changes to the set of characters considered nonspacing:

- Makes `Prepended_Concatenation_Mark`s no longer nonspacing.
  This matches the Unicode spec (which specifies these as taking up space
  in front of the characters they modify), and also aligns with
  glibc `wcwidth()`.
- Makes `Default_Ignorable_Code_Point`s other than U+115F HANGUL CHOSEONG FILLER
  nonspacing. Unicode specifies (https://www.unicode.org/faq/unsup_char.html#3)
  that these "should be rendered as completely invisible (and non advancing, i.e.
  “zero width”), if not explicitly supported in rendering." U+115F is exempted
  because it is expected to be combined with other jamo to form a width-2 Hangul
  syllable block.

Signed-off-by: Jules Bertholet <julesbertholet@quoi.xyz>
---
 lib/gen-uni-tables.c  | 18 +++++++++++++++---
 lib/uniwidth/width0.h | 18 +++++++++---------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index b948489fbf..7c0de35be6 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -3105,6 +3105,13 @@ is_property_other_default_ignorable_code_point (unsigned int ch)
   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
 }
 
+/* See PropList.txt, UCD.html.  */
+static bool
+is_property_prepended_concatenation_mark (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_PREPENDED_CONCATENATION_MARK)) != 0);
+}
+
 /* See PropList.txt, UCD.html.  */
 static bool
 is_property_deprecated (unsigned int ch)
@@ -6661,10 +6668,13 @@ fill_width (const char *width_filename)
 /* The non-spacing attribute table consists of:
    * Non-spacing characters; generated from PropList.txt or
      "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
-   * Format control characters; generated from
-     "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
+   * Format control characters except for `Prepended_Concatenation_Mark`s;
+     generated from "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" and from
+     PropList.txt
    * Zero width characters; generated from
      "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
+   * `Default_Ignorable_Code_Point`s other than U+115F HANGUL CHOSEONG FILLER;
+     generated from DerivedCoreProperties.txt
    * Hangul Jamo characters that have conjoining behaviour:
        - jungseong = syllable-middle vowels
        - jongseong = syllable-final consonants
@@ -6687,8 +6697,10 @@ is_nonspacing (unsigned int ch)
 {
   return (unicode_attributes[ch].name != NULL
           && (get_bidi_category (ch) == UC_BIDI_NSM
-              || is_category_Cc (ch) || is_category_Cf (ch)
+              || is_category_Cc (ch)
+              || (is_category_Cf (ch) && !is_property_prepended_concatenation_mark (ch))
               || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0
+              || (is_property_default_ignorable_code_point (ch) && ch != 0x115F)
               || (ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6) /* jungseong */
               || (ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB) /* jongseong */
          )   );
diff --git a/lib/uniwidth/width0.h b/lib/uniwidth/width0.h
index 77954eb4d8..041c3c12b7 100644
--- a/lib/uniwidth/width0.h
+++ b/lib/uniwidth/width0.h
@@ -46,19 +46,19 @@ static const unsigned char nonspacing_table_data[48*64] = {
   0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xbf, /* 0x0580-0x05bf */
   0xb6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
   /* 0x0600-0x07ff */
-  0x3f, 0x00, 0xff, 0x17, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
+  0x00, 0x00, 0xff, 0x17, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
   0x00, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
-  0x00, 0x00, 0xc0, 0xbf, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
-  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
+  0x00, 0x00, 0xc0, 0x9f, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
+  0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
   0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
   0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
   0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x0f, 0x20, /* 0x07c0-0x07ff */
   /* 0x0800-0x09ff */
   0x00, 0x00, 0xc0, 0xfb, 0xef, 0x3e, 0x00, 0x00, /* 0x0800-0x083f */
   0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
-  0x00, 0x00, 0x03, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
-  0x00, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08c0-0x08ff */
+  0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
+  0x00, 0xfc, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, /* 0x08c0-0x08ff */
   0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, /* 0x0900-0x093f */
   0xfe, 0x21, 0xfe, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
@@ -168,7 +168,7 @@ static const unsigned char nonspacing_table_data[48*64] = {
   0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
+  0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
   /* 0xa600-0xa7ff */
@@ -223,7 +223,7 @@ static const unsigned char nonspacing_table_data[48*64] = {
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
+  0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
   /* 0x10000-0x101ff */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10000-0x1003f */
@@ -273,8 +273,8 @@ static const unsigned char nonspacing_table_data[48*64] = {
   /* 0x11000-0x111ff */
   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, /* 0x11000-0x1103f */
   0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x80, /* 0x11040-0x1107f */
-  0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x26, /* 0x11080-0x110bf */
-  0x04, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x110c0-0x110ff */
+  0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x06, /* 0x11080-0x110bf */
+  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x110c0-0x110ff */
   0x07, 0x00, 0x00, 0x00, 0x80, 0xef, 0x1f, 0x00, /* 0x11100-0x1113f */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, /* 0x11140-0x1117f */
   0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x7f, /* 0x11180-0x111bf */
-- 
2.43.0



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] gen-uni-tables: Adjust `is_nonspacing`
  2024-02-11 19:02 [PATCH] gen-uni-tables: Adjust `is_nonspacing` Jules Bertholet
@ 2024-02-12 18:11 ` Bruno Haible
  2024-02-13  5:11   ` Jules Bertholet
  2024-02-13 16:43   ` Bruno Haible
  0 siblings, 2 replies; 4+ messages in thread
From: Bruno Haible @ 2024-02-12 18:11 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Jules Bertholet

Hi,

Jules Bertholet wrote:
> Makes two changes to the set of characters considered nonspacing:
> 
> - Makes `Prepended_Concatenation_Mark`s no longer nonspacing.
>   This matches the Unicode spec (which specifies these as taking up space
>   in front of the characters they modify), and also aligns with
>   glibc `wcwidth()`.
> - Makes `Default_Ignorable_Code_Point`s other than U+115F HANGUL CHOSEONG FILLER
>   nonspacing. Unicode specifies (https://www.unicode.org/faq/unsup_char.html#3)
>   that these "should be rendered as completely invisible (and non advancing, i.e.
>   “zero width”), if not explicitly supported in rendering." U+115F is exempted
>   because it is expected to be combined with other jamo to form a width-2 Hangul
>   syllable block.

Thank you for the suggestions.

Regarding the Prepended_Concatenation_Mark characters, I agree, and I am making
the changes; see below.

Regarding the Default_Ignorable_Code_Point characters: Making all of them
non-spacing would assign width 0 to the characters
  U+115F HANGUL CHOSEONG FILLER
  U+3164 HANGUL FILLER
  U+FFA0 HALFWIDTH HANGUL FILLER
But this does not make sense to me:

  * You exclude U+115F from your consideration, but the justification is weak:
    Hangul composition of 3 characters in the range U+11xx creates a Hangul
    syllable, and widths don't add up: 1 + 1 + 1 != 2 in the general case.

  * The name of U+FFA0, "HALFWIDTH HANGUL FILLER", suggests that
    "HANGUL FILLER" traditionally has width 2 and "HALFWIDTH HANGUL FILLER"
    traditionally has width 1. If both had width 0, there would not be a need
    for the HALFWIDTH one.

  * glibc's wcwidth() function returns nonzero for these characters:
================================================================================
#define _GNU_SOURCE 1
#include <stdio.h>
#include <wchar.h>
#include <locale.h>

int main ()
{
  setlocale(LC_ALL,"");
  printf ("%d %d %d\n", wcwidth(0x115F), wcwidth(0x3164), wcwidth(0xFFA0));
  printf ("%d %d %d %d %d %d %d %d %d %d %d %d %d\n",
          wcwidth(0x0600), wcwidth(0x0601),  wcwidth(0x0602), wcwidth(0x0603),
          wcwidth(0x0604), wcwidth(0x0605),  wcwidth(0x06DD), wcwidth(0x070F),
          wcwidth(0x0890), wcwidth(0x0891),  wcwidth(0x08E2), wcwidth(0x110BD),
          wcwidth(0x110CD));
}
================================================================================
    produces:
    $ LC_ALL=en_US.UTF-8 ./a.out 
    2 2 1
    1 1 1 1 1 1 1 1 1 1 1 1 1

  * Your argument from an FAQ is weak, since FAQs tend to simplify
    things so that they become easier to state or to understand.

Bruno


2024-02-12  Bruno Haible  <bruno@clisp.org>

	uniwidth/width: Assign width 1 to prepended concatenation marks.
	Suggested by Jules Bertholet <julesbertholet@quoi.xyz> in
	<https://lists.gnu.org/archive/html/bug-gnulib/2024-02/msg00093.html>.
	* lib/gen-uni-tables.c (is_nonspacing): For characters with property
	Prepended_Concatenation_Mark, return false instead of true.
	* lib/uniwidth/width0.h: Regenerated. This assigns width 1 to the
	characters U+0600..U+0605, U+06DD, U+070F, U+0890..U+0891, U+08E2,
	U+110BD, U+110CD.
	* modules/uniwidth/width (configure.ac): Bump required libunistring
	version.
	* modules/uniwidth/u8-width (configure.ac): Likewise.
	* modules/uniwidth/u8-strwidth (configure.ac): Likewise.
	* modules/uniwidth/u16-width (configure.ac): Likewise.
	* modules/uniwidth/u16-strwidth (configure.ac): Likewise.
	* modules/uniwidth/u32-width (configure.ac): Likewise.
	* modules/uniwidth/u32-strwidth (configure.ac): Likewise.

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index bc228105b4..c73ce06d64 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -6669,8 +6669,13 @@ fill_width (const char *width_filename)
 /* The non-spacing attribute table consists of:
    * Non-spacing characters; generated from PropList.txt or
      "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
-   * Format control characters; generated from
-     "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
+   * Format control characters, except for characters with property
+     Prepended_Concatenation_Mark; generated from
+     "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" and from
+     "grep Prepended_Concatenation_Mark PropList.txt".
+     Rationale for the Prepended_Concatenation_Mark exception:
+     The Unicode standard says "Unlike most other format characters,
+     however, they should be rendered with a visible glyph".
    * Zero width characters; generated from
      "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
    * Hangul Jamo characters that have conjoining behaviour:
@@ -6695,7 +6700,9 @@ is_nonspacing (unsigned int ch)
 {
   return (unicode_attributes[ch].name != NULL
           && (get_bidi_category (ch) == UC_BIDI_NSM
-              || is_category_Cc (ch) || is_category_Cf (ch)
+              || is_category_Cc (ch)
+              || (is_category_Cf (ch)
+                  && !is_property_prepended_concatenation_mark (ch))
               || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0
               || (ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6) /* jungseong */
               || (ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB) /* jongseong */
diff --git a/lib/uniwidth/width0.h b/lib/uniwidth/width0.h
index 77954eb4d8..6cc35536ad 100644
--- a/lib/uniwidth/width0.h
+++ b/lib/uniwidth/width0.h
@@ -46,19 +46,19 @@ static const unsigned char nonspacing_table_data[48*64] = {
   0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xbf, /* 0x0580-0x05bf */
   0xb6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
   /* 0x0600-0x07ff */
-  0x3f, 0x00, 0xff, 0x17, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
+  0x00, 0x00, 0xff, 0x17, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
   0x00, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
-  0x00, 0x00, 0xc0, 0xbf, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
-  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
+  0x00, 0x00, 0xc0, 0x9f, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
+  0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
   0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
   0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
   0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x0f, 0x20, /* 0x07c0-0x07ff */
   /* 0x0800-0x09ff */
   0x00, 0x00, 0xc0, 0xfb, 0xef, 0x3e, 0x00, 0x00, /* 0x0800-0x083f */
   0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
-  0x00, 0x00, 0x03, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
-  0x00, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08c0-0x08ff */
+  0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
+  0x00, 0xfc, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, /* 0x08c0-0x08ff */
   0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, /* 0x0900-0x093f */
   0xfe, 0x21, 0xfe, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
@@ -273,8 +273,8 @@ static const unsigned char nonspacing_table_data[48*64] = {
   /* 0x11000-0x111ff */
   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, /* 0x11000-0x1103f */
   0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x80, /* 0x11040-0x1107f */
-  0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x26, /* 0x11080-0x110bf */
-  0x04, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x110c0-0x110ff */
+  0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x06, /* 0x11080-0x110bf */
+  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x110c0-0x110ff */
   0x07, 0x00, 0x00, 0x00, 0x80, 0xef, 0x1f, 0x00, /* 0x11100-0x1113f */
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, /* 0x11140-0x1117f */
   0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x7f, /* 0x11180-0x111bf */
diff --git a/modules/uniwidth/u16-strwidth b/modules/uniwidth/u16-strwidth
index 1a4ea001e9..f7ceb9272c 100644
--- a/modules/uniwidth/u16-strwidth
+++ b/modules/uniwidth/u16-strwidth
@@ -10,7 +10,7 @@ uniwidth/u16-width
 unistr/u16-strlen
 
 configure.ac:
-gl_LIBUNISTRING_MODULE([1.1], [uniwidth/u16-strwidth])
+gl_LIBUNISTRING_MODULE([1.2], [uniwidth/u16-strwidth])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIWIDTH_U16_STRWIDTH
diff --git a/modules/uniwidth/u16-width b/modules/uniwidth/u16-width
index 161898c93e..dfd08e3fec 100644
--- a/modules/uniwidth/u16-width
+++ b/modules/uniwidth/u16-width
@@ -10,7 +10,7 @@ uniwidth/width
 unistr/u16-mbtouc-unsafe
 
 configure.ac:
-gl_LIBUNISTRING_MODULE([1.1], [uniwidth/u16-width])
+gl_LIBUNISTRING_MODULE([1.2], [uniwidth/u16-width])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIWIDTH_U16_WIDTH
diff --git a/modules/uniwidth/u32-strwidth b/modules/uniwidth/u32-strwidth
index 9c36df422a..a13836f1cc 100644
--- a/modules/uniwidth/u32-strwidth
+++ b/modules/uniwidth/u32-strwidth
@@ -10,7 +10,7 @@ uniwidth/u32-width
 unistr/u32-strlen
 
 configure.ac:
-gl_LIBUNISTRING_MODULE([1.1], [uniwidth/u32-strwidth])
+gl_LIBUNISTRING_MODULE([1.2], [uniwidth/u32-strwidth])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIWIDTH_U32_STRWIDTH
diff --git a/modules/uniwidth/u32-width b/modules/uniwidth/u32-width
index 34f25fea76..d90d9b9a20 100644
--- a/modules/uniwidth/u32-width
+++ b/modules/uniwidth/u32-width
@@ -9,7 +9,7 @@ uniwidth/base
 uniwidth/width
 
 configure.ac:
-gl_LIBUNISTRING_MODULE([1.1], [uniwidth/u32-width])
+gl_LIBUNISTRING_MODULE([1.2], [uniwidth/u32-width])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIWIDTH_U32_WIDTH
diff --git a/modules/uniwidth/u8-strwidth b/modules/uniwidth/u8-strwidth
index 303ee5c010..26857ae4b0 100644
--- a/modules/uniwidth/u8-strwidth
+++ b/modules/uniwidth/u8-strwidth
@@ -10,7 +10,7 @@ uniwidth/u8-width
 unistr/u8-strlen
 
 configure.ac:
-gl_LIBUNISTRING_MODULE([1.1], [uniwidth/u8-strwidth])
+gl_LIBUNISTRING_MODULE([1.2], [uniwidth/u8-strwidth])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIWIDTH_U8_STRWIDTH
diff --git a/modules/uniwidth/u8-width b/modules/uniwidth/u8-width
index 36df3afe3f..46f5e4e014 100644
--- a/modules/uniwidth/u8-width
+++ b/modules/uniwidth/u8-width
@@ -10,7 +10,7 @@ uniwidth/width
 unistr/u8-mbtouc-unsafe
 
 configure.ac:
-gl_LIBUNISTRING_MODULE([1.1], [uniwidth/u8-width])
+gl_LIBUNISTRING_MODULE([1.2], [uniwidth/u8-width])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIWIDTH_U8_WIDTH
diff --git a/modules/uniwidth/width b/modules/uniwidth/width
index 30973445b0..dc028317a1 100644
--- a/modules/uniwidth/width
+++ b/modules/uniwidth/width
@@ -13,7 +13,7 @@ uniwidth/base
 streq
 
 configure.ac:
-gl_LIBUNISTRING_MODULE([1.1], [uniwidth/width])
+gl_LIBUNISTRING_MODULE([1.2], [uniwidth/width])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIWIDTH_WIDTH





^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] gen-uni-tables: Adjust `is_nonspacing`
  2024-02-12 18:11 ` Bruno Haible
@ 2024-02-13  5:11   ` Jules Bertholet
  2024-02-13 16:43   ` Bruno Haible
  1 sibling, 0 replies; 4+ messages in thread
From: Jules Bertholet @ 2024-02-13  5:11 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Bruno Haible

Hello,

Thanks for your response.

Bruno Haible wrote:
> Regarding the Default_Ignorable_Code_Point characters: Making all of
> them
> non-spacing would assign width 0 to the characters
>   U+115F HANGUL CHOSEONG FILLER
>   U+3164 HANGUL FILLER
>   U+FFA0 HALFWIDTH HANGUL FILLER
> But this does not make sense to me:
> 
>   * You exclude U+115F from your consideration, but the justification
> is weak: Hangul composition of 3 characters in the range U+11xx
> creates a Hangul syllable, and widths don't add up: 1 + 1 + 1 != 2 in
> the general case.

The combining Hangul jamo characters are all assigned an
`East_Asian_Width` of `Wide` by Unicode, which would normally mean they
would all be assigned width 2; a combination of (leading choseong) +
(medial jungseong) + (trailing jongseong) would have width 2 + 2 + 2 =
6. However, this library (and glibc, and other wcwidth implementations)
special-cases jungseong and jongseong, assigning them all width 0, to
ensure that the complete block has width 2 + 0 + 0 = 2. Assigning
U+115F a width of 2 even though it has no visible display is necessary
to keep this scheme working.

You can read more about Unicode jamo in the Unicode spec, sections 3.12
<https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G24646> and
18.6 <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G31028>.

>   * The names of U+FFA0 being "HALFWIDTH HANGUL FILLER", it suggests
> that "HANGUL FILLER" traditionally has width 2 and "HALFWIDTH HANGUL
> FILLER" traditionally has width 1. If both had width 0, there would
> not be a need for the HALFWIDTH one.

U+3164 HANGUL FILLER and U+FFA0 HALFWIDTH HANGUL FILLER are
compatibility characters that exist purely for interoperability with
legacy character sets. That's why their behavior may seem strange. I
have found some historical background on them here, though I can't be
sure it's fully accurate:
<https://github.com/jagracey/Awesome-Unicode/issues/4>

>   * glibc's wcwidth() function returns nonzero for these characters:

I've submitted a patch to glibc as well.

>   * Your argument by an FAQ is weak, since FAQs typically tend to
> simplify things, so that they become easier to state or to
> understand.

The Unicode Standard, version 15.1, §5.21 - Characters Ignored for
Display
<https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095> states
that all `Default_Ignorable_Code_Point`s should be "ignored for display
in fallback rendering", including "Hangul fillers". There is no
ambiguity here, and common rendering implementations treat them as
zero-width just like the spec says.

---

Since submitting this patch, I've noticed that §5.21 also highlights
another issue with gnulib (and glibc)'s width implementation:

"""
A small number of format characters (General_Category = Cf) are also
not given the Default_Ignorable_Code_Point property. This may surprise
implementers, who often assume that all format characters are generally
ignored in fallback display. The exact list of these exceptional format
characters can be found in the Unicode Character Database. There are,
however, three important sets of such format characters to note:

- prepended concatenation marks
- interlinear annotation characters
- Egyptian hieroglyph format controls

[...]

The other two notable sets of format characters that exceptionally are
not ignored in fallback display consist of the interlinear annotation
characters, U+FFF9 INTERLINEAR ANNOTATION ANCHOR through U+FFFB
INTERLINEAR ANNOTATION TERMINATOR, and the Egyptian hieroglyph format
controls, U+13430 EGYPTIAN HIEROGLYPH VERTICAL JOINER through U+1343F
EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE. These characters should have
a visible glyph display for fallback rendering, because if they are not
displayed, it is too easy to misread the resulting displayed text. See
“Annotation Characters” in Section 23.8, Specials[0], as well as
Section 11.4, Egyptian Hieroglyphs[1] for more discussion of the use
and display of these characters.

[0]:
https://www.unicode.org/versions/Unicode15.1.0/ch23.pdf#M9.21335.Heading.133.Specials
[1]:
https://www.unicode.org/versions/Unicode15.1.0/ch11.pdf#M9.73291.Heading.1418.Egyptian.Hieroglyphs
"""

There is no way for a terminal to realistically handle the interlinear
annotation anchors except via fallback rendering, so these should have
non-zero width also. As for the hieroglyph format controls, they appear
to not have wide support at present, so assuming fallback rendering
likely makes sense there as well. This would imply that we should
perhaps change the zero-width detection logic to not check whether a
character is a format control (Cf) at all, and only check for
`Default_Ignorable_Code_Point`s, categories Cc, Me, and Mn, and the
Hangul special cases.

Jules Bertholet


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] gen-uni-tables: Adjust `is_nonspacing`
  2024-02-12 18:11 ` Bruno Haible
  2024-02-13  5:11   ` Jules Bertholet
@ 2024-02-13 16:43   ` Bruno Haible
  1 sibling, 0 replies; 4+ messages in thread
From: Bruno Haible @ 2024-02-13 16:43 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Jules Bertholet

> 2024-02-12  Bruno Haible  <bruno@clisp.org>
> 
> 	uniwidth/width: Assign width 1 to prepended concatenation marks.

Oops, I forgot to update the unit test. Fortunately, there are various
continuous integration systems running...


2024-02-13  Bruno Haible  <bruno@clisp.org>

	uniwidth/width tests: Update unit test for last commit.
	* tests/uniwidth/test-uc_width2.sh: Update expected test result.

diff --git a/tests/uniwidth/test-uc_width2.sh b/tests/uniwidth/test-uc_width2.sh
index ae6f8f4594..c46b3d07b6 100755
--- a/tests/uniwidth/test-uc_width2.sh
+++ b/tests/uniwidth/test-uc_width2.sh
@@ -27,9 +27,7 @@ cat > uc_width.ok <<\EOF
 05C4..05C5	0
 05C6		A
 05C7		0
-05C8..05FF	A
-0600..0605	0
-0606..060F	A
+05C8..060F	A
 0610..061A	0
 061B		A
 061C		0
@@ -38,16 +36,14 @@ cat > uc_width.ok <<\EOF
 0660..066F	A
 0670		0
 0671..06D5	A
-06D6..06DD	0
-06DE		A
+06D6..06DC	0
+06DD..06DE	A
 06DF..06E4	0
 06E5..06E6	A
 06E7..06E8	0
 06E9		A
 06EA..06ED	0
-06EE..070E	A
-070F		0
-0710		A
+06EE..0710	A
 0711		0
 0712..072F	A
 0730..074A	0
@@ -67,12 +63,12 @@ cat > uc_width.ok <<\EOF
 0829..082D	0
 082E..0858	A
 0859..085B	0
-085C..088F	A
-0890..0891	0
-0892..0897	A
+085C..0897	A
 0898..089F	0
 08A0..08C9	A
-08CA..0902	0
+08CA..08E1	0
+08E2		A
+08E3..0902	0
 0903..0939	A
 093A		0
 093B		A
@@ -580,13 +576,9 @@ FFFC..101FC	1
 110B3..110B6	0
 110B7..110B8	1
 110B9..110BA	0
-110BB..110BC	1
-110BD		0
-110BE..110C1	1
+110BB..110C1	1
 110C2		0
-110C3..110CC	1
-110CD		0
-110CE..110FF	1
+110C3..110FF	1
 11100..11102	0
 11103..11126	1
 11127..1112B	0





^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-02-13 16:43 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-11 19:02 [PATCH] gen-uni-tables: Adjust `is_nonspacing` Jules Bertholet
2024-02-12 18:11 ` Bruno Haible
2024-02-13  5:11   ` Jules Bertholet
2024-02-13 16:43   ` Bruno Haible

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).