We add a new C.UTF-8 locale. This locale is not builtin to glibc, but is provided as a distinct locale. The locale provides full support for UTF-8 and this includes full code point sorting via collation (excludes surrogates). Unfortuantely given the present implementation in glibc this results in 28MiB of LC_COLLATE data for all possible Unicode code points. Future improvements may reduce this size. Such improvements likely require a shortcut for the collation data that relies on C.UTF-8 single-byte sorting being equivalent to strcmp. The new locale is NOT added to SUPPORTED. Minimal test data for specific code points (minus those not supported by collate-test) is provided in C.UTF-8.in, and this verifies code point sorting is working reasonably across the range. The next step is to reduce LC_COLLATE to a manageable size before we enable the locale in SUPPORTED. Fully testing C.UTF-8 collation can add ~5-7 minutes to the locale testing (collate-test, and xfrm-test twice) so we don't enable full testing of all code points until we can parallelize the sort-test test. Testing sort-test with C.UTF-8 minimal test data passes cleanly. Tested on x86_64 or i686 without regression. --- localedata/C.UTF-8.in | 156 +++++++++++++++++++++++++++++++++++ localedata/Makefile | 2 + localedata/locales/C | 188 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 346 insertions(+) create mode 100644 localedata/C.UTF-8.in create mode 100644 localedata/locales/C diff --git a/localedata/C.UTF-8.in b/localedata/C.UTF-8.in new file mode 100644 index 0000000000..b8764a4e04 --- /dev/null +++ b/localedata/C.UTF-8.in @@ -0,0 +1,156 @@ + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; + ; +! ; +" ; +# ; +$ ; +% ; +& ; +' ; +) ; +* ; ++ ; +, ; +- ; +. ; +/ ; +0 ; +1 ; +2 ; +3 ; +4 ; +5 ; +6 ; +7 ; +8 ; +9 ; +< ; += ; +> ; +? 
; +@ ; +A ; +B ; +C ; +D ; +E ; +F ; +G ; +H ; +I ; +J ; +K ; +L ; +M ; +N ; +O ; +P ; +Q ; +R ; +S ; +T ; +U ; +V ; +W ; +X ; +Y ; +Z ; +[ ; +\ ; +] ; +^ ; +_ ; +` ; +a ; +b ; +c ; +d ; +e ; +f ; +g ; +h ; +i ; +j ; +k ; +l ; +m ; +n ; +o ; +p ; +q ; +r ; +s ; +t ; +u ; +v ; +w ; +x ; +y ; +z ; +{ ; +| ; +} ; +~ ; + ; + ; +ÿ ; +Ā ; +࿿ ; +က ; + ; +𐀀 ; +🿿 ; +𠀀 ; +𯿿 ; +𰀀 ; +𿿾 ; +񀀀 ; +񏿿 ; +񐀀 ; +񟿿 ; +񠀀 ; +񯿿 ; +񰀀 ; +񿿿 ; +򀀀 ; +򏿿 ; +򐀀 ; +򟿿 ; +򠀀 ; +򯿿 ; +򰀀 ; +򿿿 ; +󀀁 ; +󏿌 ; +󐀎 ; +󟿿 ; +󠀁 ; +󯿿 ; +󰀁 ; +󿿿 ; +􀀁 ; +􏿿 ; diff --git a/localedata/Makefile b/localedata/Makefile index 14e04cd3c5..38017f2c4c 100644 --- a/localedata/Makefile +++ b/localedata/Makefile @@ -47,6 +47,7 @@ test-input := \ bg_BG.UTF-8 \ br_FR.UTF-8 \ bs_BA.UTF-8 \ + C.UTF-8 \ ckb_IQ.UTF-8 \ cmn_TW.UTF-8 \ crh_UA.UTF-8 \ @@ -206,6 +207,7 @@ LOCALES := \ bg_BG.UTF-8 \ br_FR.UTF-8 \ bs_BA.UTF-8 \ + C.UTF-8 \ ckb_IQ.UTF-8 \ cmn_TW.UTF-8 \ crh_UA.UTF-8 \ diff --git a/localedata/locales/C b/localedata/locales/C new file mode 100644 index 0000000000..67e5bd913b --- /dev/null +++ b/localedata/locales/C @@ -0,0 +1,188 @@ +escape_char / +comment_char % +% Locale for C locale in UTF-8 + +LC_IDENTIFICATION +title "C locale" +source "" +address "" +contact "" +email "bug-glibc-locales@gnu.org" +tel "" +fax "" +language "" +territory "" +revision "2.0" +date "2020-06-28" +category "i18n:2012";LC_IDENTIFICATION +category "i18n:2012";LC_CTYPE +category "i18n:2012";LC_COLLATE +category "i18n:2012";LC_TIME +category "i18n:2012";LC_NUMERIC +category "i18n:2012";LC_MONETARY +category "i18n:2012";LC_MESSAGES +category "i18n:2012";LC_PAPER +category "i18n:2012";LC_NAME +category "i18n:2012";LC_ADDRESS +category "i18n:2012";LC_TELEPHONE +category "i18n:2012";LC_MEASUREMENT +END LC_IDENTIFICATION + +LC_CTYPE + +% Include only the i18n character type classes without any of the +% transliteration that i18n uses by default. The C locale has no +% transliteration and passes all characters through unchanged. 
+copy "i18n_ctype" + +END LC_CTYPE + +% One rule, sort forward, for all Unicode scalar values to give +% code point order sorting for Unicode (excludes surrogates +% which are not in the UTF-8 character map). +LC_COLLATE +order_start forward + +.. + +% Exclude surrogates from collation. + +.. + +UNDEFINED +order_end +END LC_COLLATE + +LC_MONETARY + +% This is the 14652 i18n fdcc-set definition for the LC_MONETARY +% category (except for the int_curr_symbol and currency_symbol, they are +% empty in the 14652 i18n fdcc-set definition and also empty in +% glibc/locale/C-monetary.c.). +int_curr_symbol "" +currency_symbol "" +mon_decimal_point "." +mon_thousands_sep "" +mon_grouping -1 +positive_sign "" +negative_sign "-" +int_frac_digits -1 +frac_digits -1 +p_cs_precedes -1 +int_p_sep_by_space -1 +p_sep_by_space -1 +n_cs_precedes -1 +int_n_sep_by_space -1 +n_sep_by_space -1 +p_sign_posn -1 +n_sign_posn -1 +% +END LC_MONETARY + +LC_NUMERIC +% This is the POSIX Locale definition for +% the LC_NUMERIC category. +% +decimal_point "." +thousands_sep "" +grouping -1 +END LC_NUMERIC + +LC_TIME +% This is the POSIX Locale definition for the LC_TIME category with the +% exception that time is per ISO 8601 and 24-hour. +% +% Abbreviated weekday names (%a) +abday "Sun";"Mon";"Tue";"Wed";"Thu";"Fri";"Sat" + +% Full weekday names (%A) +day "Sunday";"Monday";"Tuesday";"Wednesday";"Thursday";/ + "Friday";"Saturday" + +% Abbreviated month names (%b) +abmon "Jan";"Feb";"Mar";"Apr";"May";"Jun";"Jul";"Aug";"Sep";/ + "Oct";"Nov";"Dec" + +% Full month names (%B) +mon "January";"February";"March";"April";"May";"June";"July";/ + "August";"September";"October";"November";"December" + +% Week description, consists of three fields: +% 1. Number of days in a week. +% 2. Gregorian date that is a first weekday (19971130 for Sunday, 19971201 for Monday). +% 3. The weekday number to be contained in the first week of the year. 
+% +% ISO 8601 conforming applications should use the values 7, 19971201 (a +% Monday), and 4 (Thursday), respectively. +week 7;19971201;4 +first_weekday 1 +first_workday 1 + +% Appropriate date and time representation (%c) +d_t_fmt "%a %b %e %H:%M:%S %Y" + +% Appropriate date representation (%x) +d_fmt "%m/%d/%y" + +% Appropriate time representation (%X) +t_fmt "%H:%M:%S" + +% Appropriate AM/PM time representation (%r) +t_fmt_ampm "%I:%M:%S %p" + +% Equivalent of AM/PM (%p) +am_pm "AM";"PM" + +% Appropriate date representation (date(1)) "%a %b %e %H:%M:%S %Z %Y" +date_fmt "%a %b %e %H:%M:%S %Z %Y" +END LC_TIME + +LC_MESSAGES +% This is the POSIX Locale definition for +% the LC_MESSAGES category. +% +yesexpr "^[yY]" +noexpr "^[nN]" +yesstr "Yes" +nostr "No" +END LC_MESSAGES + +LC_PAPER +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_PAPER category. +% (A4 paper, this is also used in the built in C/POSIX +% locale in glibc/locale/C-paper.c) +height 297 +width 210 +END LC_PAPER + +LC_NAME +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_NAME category. +% (also used in the built in C/POSIX locale in glibc/locale/C-name.c) +name_fmt "%p%t%g%t%m%t%f" +END LC_NAME + +LC_ADDRESS +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_ADDRESS category. +% (also used in the built in C/POSIX locale in glibc/locale/C-address.c) +postal_fmt "%a%N%f%N%d%N%b%N%s %h %e %r%N%C-%z %T%N%c%N" +END LC_ADDRESS + +LC_TELEPHONE +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_TELEPHONE category. +% "+%c %a %l" +tel_int_fmt "+%c %a %l" +% (also used in the built in C/POSIX locale in glibc/locale/C-telephone.c) +END LC_TELEPHONE + +LC_MEASUREMENT +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_MEASUREMENT category. +% (same as in the built in C/POSIX locale in glibc/locale/C-measurement.c) +%metric +measurement 1 +END LC_MEASUREMENT + -- 2.26.3