From 9be236d67f3d78235c5cbe4381c5dd7b3cddb179 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Thu, 9 Jan 2020 01:47:17 +0100 Subject: [PATCH 1/4] mbrtoc32: Use the system's mbrtoc32 if it exists and basically works. * m4/mbrtoc32.m4 (gl_MBRTOC32_SANITYCHECK): New macro. (gl_FUNC_MBRTOC32): Require it. Set REPLACE_MBRTOC32 if mbrtoc32 exists but is not working. * lib/mbrtoc32.c: Include hard-locale.h, <locale.h>. (mbrtoc32): If the char32_t encoding and the wchar_t encoding may differ, use the system's mbrtoc32, adding workarounds. * modules/mbrtoc32 (Depends-on): Add hard-locale. * doc/posix-functions/mbrtoc32.texi: Mention the Solaris and native Windows problem. * lib/btoc32.c: Include <stdio.h>, <string.h>. (btoc32): If the char32_t encoding and the wchar_t encoding may differ, use mbrtoc32, not btowc. * modules/btoc32 (Depends-on): Add mbrtoc32. * lib/mbsrtoc32s.c (mbsrtoc32s): If the char32_t encoding and the wchar_t encoding may differ, use mbrtoc32, not mbsrtowcs. * modules/mbsrtoc32s (Depends-on): Update conditions. (configure.ac): Compile mbsrtoc32s-state.c unconditionally. * lib/mbsnrtoc32s.c (mbsnrtoc32s): If the char32_t encoding and the wchar_t encoding may differ, use mbrtoc32, not mbsnrtowcs. * modules/mbsnrtoc32s (Depends-on): Update conditions. (configure.ac): Compile mbsrtoc32s-state.c unconditionally. --- ChangeLog | 25 ++++++++++ doc/posix-functions/mbrtoc32.texi | 4 ++ lib/btoc32.c | 20 ++++++++ lib/mbrtoc32.c | 53 ++++++++++++++------ lib/mbsnrtoc32s.c | 4 +- lib/mbsrtoc32s.c | 4 +- m4/mbrtoc32.m4 | 102 +++++++++++++++++++++++++++++++++++++- modules/btoc32 | 1 + modules/mbrtoc32 | 1 + modules/mbsnrtoc32s | 10 ++-- modules/mbsrtoc32s | 8 ++- 11 files changed, 204 insertions(+), 28 deletions(-) diff --git a/ChangeLog b/ChangeLog index ea35e7e..4b5a419 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +2020-01-08 Bruno Haible + + mbrtoc32: Use the system's mbrtoc32 if it exists and basically works. + * m4/mbrtoc32.m4 (gl_MBRTOC32_SANITYCHECK): New macro. 
+ (gl_FUNC_MBRTOC32): Require it. Set REPLACE_MBRTOC32 if mbrtoc32 exists + but is not working. + * lib/mbrtoc32.c: Include hard-locale.h, <locale.h>. + (mbrtoc32): If the char32_t encoding and the wchar_t encoding may + differ, use the system's mbrtoc32, adding workarounds. + * modules/mbrtoc32 (Depends-on): Add hard-locale. + * doc/posix-functions/mbrtoc32.texi: Mention the Solaris and native + Windows problem. + * lib/btoc32.c: Include <stdio.h>, <string.h>. + (btoc32): If the char32_t encoding and the wchar_t encoding may differ, + use mbrtoc32, not btowc. + * modules/btoc32 (Depends-on): Add mbrtoc32. + * lib/mbsrtoc32s.c (mbsrtoc32s): If the char32_t encoding and the + wchar_t encoding may differ, use mbrtoc32, not mbsrtowcs. + * modules/mbsrtoc32s (Depends-on): Update conditions. + (configure.ac): Compile mbsrtoc32s-state.c unconditionally. + * lib/mbsnrtoc32s.c (mbsnrtoc32s): If the char32_t encoding and the + wchar_t encoding may differ, use mbrtoc32, not mbsnrtowcs. + * modules/mbsnrtoc32s (Depends-on): Update conditions. + (configure.ac): Compile mbsrtoc32s-state.c unconditionally. + 2020-01-07 Bruno Haible wcrtomb: Make multithread-safe, except possibly on IRIX. diff --git a/doc/posix-functions/mbrtoc32.texi b/doc/posix-functions/mbrtoc32.texi index 1aa15a3..9789bef 100644 --- a/doc/posix-functions/mbrtoc32.texi +++ b/doc/posix-functions/mbrtoc32.texi @@ -17,6 +17,10 @@ glibc 2.23. This function returns 0 instead of @code{(size_t) -2} when the input is empty: glibc 2.19. +@item +This function does not recognize multibyte sequences that @code{mbrtowc} +recognizes on some platforms: +Solaris 11.4, mingw, MSVC 14. @end itemize Portability problems not fixed by Gnulib: diff --git a/lib/btoc32.c b/lib/btoc32.c index 8b27875..d8ce087 100644 --- a/lib/btoc32.c +++ b/lib/btoc32.c @@ -21,10 +21,30 @@ /* Specification. 
*/ #include <uchar.h> +#include <stdio.h> +#include <string.h> + wint_t btoc32 (int c) { +#if HAVE_WORKING_MBRTOC32 && !defined __GLIBC__ + /* The char32_t encoding of a multibyte character may be different than its + wchar_t encoding. */ + if (c != EOF) + { + mbstate_t state; + char s[1]; + char32_t wc; + + memset (&state, '\0', sizeof (mbstate_t)); + s[0] = (unsigned char) c; + if (mbrtoc32 (&wc, s, 1, &state) <= 1) + return wc; + } + return WEOF; +#else /* In all known locale encodings, unibyte characters correspond only to characters in the BMP. */ return btowc (c); +#endif } diff --git a/lib/mbrtoc32.c b/lib/mbrtoc32.c index f2cf71e..facf28b 100644 --- a/lib/mbrtoc32.c +++ b/lib/mbrtoc32.c @@ -24,13 +24,13 @@ #include <errno.h> #include <stdlib.h> -# ifndef FALLTHROUGH -# if __GNUC__ < 7 -# define FALLTHROUGH ((void) 0) -# else -# define FALLTHROUGH __attribute__ ((__fallthrough__)) -# endif +#ifndef FALLTHROUGH +# if __GNUC__ < 7 +# define FALLTHROUGH ((void) 0) +# else +# define FALLTHROUGH __attribute__ ((__fallthrough__)) # endif +#endif #if GNULIB_defined_mbstate_t /* AIX, IRIX */ /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales @@ -74,17 +74,23 @@ mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */ -/* Implement mbrtoc32() based on mbrtowc(). */ +/* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc(). */ # include <wchar.h> # include "localcharset.h" # include "streq.h" +# if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ +# include "hard-locale.h" +# include <locale.h> +# endif + static mbstate_t internal_state; size_t mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) +# undef mbrtoc32 { /* It's simpler to handle the case s == NULL upfront, than to worry about this case later, before every test of pwc and n. 
*/ @@ -103,7 +109,31 @@ mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) if (ps == NULL) ps = &internal_state; -# if _GL_LARGE_CHAR32_T +# if HAVE_WORKING_MBRTOC32 + /* mbrtoc32() may produce different values for wc than mbrtowc(). Therefore + use mbrtoc32(). */ + +# if defined _WIN32 && !defined __CYGWIN__ + char32_t wc; + size_t ret = mbrtoc32 (&wc, s, n, ps); + if (ret < (size_t) -2 && pwc != NULL) + *pwc = wc; +# else + size_t ret = mbrtoc32 (pwc, s, n, ps); +# endif + +# if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ + if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE)) + { + if (pwc != NULL) + *pwc = (unsigned char) *s; + return 1; + } +# endif + + return ret; + +# elif _GL_LARGE_CHAR32_T /* Special-case all encodings that may produce wide character values > WCHAR_MAX. */ @@ -209,12 +239,7 @@ mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) # else - /* char32_t and wchar_t are equivalent. - Two implementations are possible: - - We can call the original mbrtoc32 (if it exists) and handle - MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ. - - We can call mbrtowc. - The latter is simpler. */ + /* char32_t and wchar_t are equivalent. Use mbrtowc(). */ wchar_t wc; size_t ret = mbrtowc (&wc, s, n, ps); if (ret < (size_t) -2 && pwc != NULL) diff --git a/lib/mbsnrtoc32s.c b/lib/mbsnrtoc32s.c index 7ba0415..c0f6e1f 100644 --- a/lib/mbsnrtoc32s.c +++ b/lib/mbsnrtoc32s.c @@ -22,7 +22,9 @@ #include <wchar.h> -#if _GL_LARGE_CHAR32_T +#if (HAVE_WORKING_MBRTOC32 && !defined __GLIBC__) || _GL_LARGE_CHAR32_T +/* The char32_t encoding of a multibyte character may be different than its + wchar_t encoding, or char32_t is wider than wchar_t. 
*/ /* For Cygwin >= 1.7 it would be possible to speed this up a bit by cutting the source into chunks, calling mbsnrtowcs on a chunk, then u16_to_u32 on diff --git a/lib/mbsrtoc32s.c b/lib/mbsrtoc32s.c index 432ffaf..8887ddf 100644 --- a/lib/mbsrtoc32s.c +++ b/lib/mbsrtoc32s.c @@ -22,7 +22,9 @@ #include <wchar.h> -#if _GL_LARGE_CHAR32_T +#if (HAVE_WORKING_MBRTOC32 && !defined __GLIBC__) || _GL_LARGE_CHAR32_T +/* The char32_t encoding of a multibyte character may be different than its + wchar_t encoding, or char32_t is wider than wchar_t. */ # include <errno.h> # include <limits.h> diff --git a/m4/mbrtoc32.m4 b/m4/mbrtoc32.m4 index 5039fc7..3dee900 100644 --- a/m4/mbrtoc32.m4 +++ b/m4/mbrtoc32.m4 @@ -1,4 +1,4 @@ -# mbrtoc32.m4 serial 1 +# mbrtoc32.m4 serial 2 dnl Copyright (C) 2014-2020 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -11,6 +11,8 @@ AC_DEFUN([gl_FUNC_MBRTOC32], AC_REQUIRE([AC_TYPE_MBSTATE_T]) gl_MBSTATE_T_BROKEN + AC_REQUIRE([gl_MBRTOC32_SANITYCHECK]) + AC_CHECK_FUNCS_ONCE([mbrtoc32]) if test $ac_cv_func_mbrtoc32 = no; then HAVE_MBRTOC32=0 @@ -35,6 +37,9 @@ AC_DEFUN([gl_FUNC_MBRTOC32], ;; esac fi + if test $HAVE_WORKING_MBRTOC32 = 0; then + REPLACE_MBRTOC32=1 + fi fi ]) @@ -111,6 +116,101 @@ AC_DEFUN([gl_MBRTOC32_C_LOCALE], ]) ]) +dnl Test whether mbrtoc32 works not worse than mbrtowc. +dnl Result is HAVE_WORKING_MBRTOC32. + +AC_DEFUN([gl_MBRTOC32_SANITYCHECK], +[ + AC_REQUIRE([AC_PROG_CC]) + AC_CHECK_FUNCS_ONCE([mbrtoc32]) + AC_REQUIRE([gt_LOCALE_FR]) + AC_REQUIRE([gt_LOCALE_ZH_CN]) + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + if test $ac_cv_func_mbrtoc32 = no; then + HAVE_WORKING_MBRTOC32=0 + else + AC_CACHE_CHECK([whether mbrtoc32 works as well as mbrtowc], + [gl_cv_func_mbrtoc32_sanitycheck], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. 
+changequote(,)dnl + case "$host_os" in + # Guess no on Solaris, native Windows. + solaris* | mingw*) gl_cv_func_mbrtoc32_sanitycheck="guessing no" ;; + # Guess yes otherwise. + *) gl_cv_func_mbrtoc32_sanitycheck="guessing yes" ;; + esac +changequote([,])dnl + if test $LOCALE_FR != none || test $LOCALE_ZH_CN != none; then + AC_RUN_IFELSE( + [AC_LANG_SOURCE([[ +#include <locale.h> +#include <stdlib.h> +#include <string.h> +/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before + <wchar.h>. + BSD/OS 4.0.1 has a bug: <stddef.h>, <stdio.h> and <time.h> must be + included before <wchar.h>. */ +#include <stddef.h> +#include <stdio.h> +#include <time.h> +#include <wchar.h> +#include <uchar.h> +int main () +{ + int result = 0; + /* This fails on native Windows: + mbrtoc32 returns (size_t)-1. + mbrtowc returns 1 (correct). */ + if (setlocale (LC_ALL, "$LOCALE_FR") != NULL) + { + mbstate_t state; + wchar_t wc = (wchar_t) 0xBADFACE; + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtowc (&wc, "\374", 1, &state) == 1) + { + char32_t c32 = (wchar_t) 0xBADFACE; + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtoc32 (&c32, "\374", 1, &state) != 1) + result |= 1; + } + } + /* This fails on Solaris 11.4: + mbrtoc32 returns (size_t)-1. + mbrtowc returns 4 (correct). */ + if (setlocale (LC_ALL, "$LOCALE_ZH_CN") != NULL) + { + mbstate_t state; + wchar_t wc = (wchar_t) 0xBADFACE; + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtowc (&wc, "\224\071\375\067", 4, &state) == 4) + { + char32_t c32 = (wchar_t) 0xBADFACE; + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtoc32 (&c32, "\224\071\375\067", 4, &state) != 4) + result |= 2; + } + } + return result; +}]])], + [gl_cv_func_mbrtoc32_sanitycheck=yes], + [gl_cv_func_mbrtoc32_sanitycheck=no], + [:]) + fi + ]) + case "$gl_cv_func_mbrtoc32_sanitycheck" in + *yes) + HAVE_WORKING_MBRTOC32=1 + AC_DEFINE([HAVE_WORKING_MBRTOC32], [1], + [Define if the mbrtoc32 function basically works.]) + ;; + *) HAVE_WORKING_MBRTOC32=0 ;; + esac + fi + AC_SUBST([HAVE_WORKING_MBRTOC32]) +]) + # Prerequisites of lib/mbrtoc32.c and lib/lc-charset-dispatch.c. 
AC_DEFUN([gl_PREREQ_MBRTOC32], [ : diff --git a/modules/btoc32 b/modules/btoc32 index 5e5d4a9..caf36d3 100644 --- a/modules/btoc32 +++ b/modules/btoc32 @@ -6,6 +6,7 @@ lib/btoc32.c Depends-on: uchar +mbrtoc32 btowc configure.ac: diff --git a/modules/mbrtoc32 b/modules/mbrtoc32 index 2575394..cf41846 100644 --- a/modules/mbrtoc32 +++ b/modules/mbrtoc32 @@ -18,6 +18,7 @@ m4/visibility.m4 Depends-on: uchar +hard-locale [{ test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1; } && test $REPLACE_MBSTATE_T = 0] mbrtowc [{ test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1; } && test $REPLACE_MBSTATE_T = 0] localcharset [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] streq [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] diff --git a/modules/mbsnrtoc32s b/modules/mbsnrtoc32s index 44784d8..ac464a8 100644 --- a/modules/mbsnrtoc32s +++ b/modules/mbsnrtoc32s @@ -10,16 +10,14 @@ Depends-on: uchar wchar verify -mbrtoc32 [test $SMALL_WCHAR_T = 1] -minmax [test $SMALL_WCHAR_T = 1] -strnlen1 [test $SMALL_WCHAR_T = 1] +mbrtoc32 +minmax +strnlen1 mbsnrtowcs [test $SMALL_WCHAR_T = 0] configure.ac: AC_REQUIRE([gl_UCHAR_H]) -if test $SMALL_WCHAR_T = 1; then - AC_LIBOBJ([mbsrtoc32s-state]) -fi +AC_LIBOBJ([mbsrtoc32s-state]) gl_UCHAR_MODULE_INDICATOR([mbsnrtoc32s]) Makefile.am: diff --git a/modules/mbsrtoc32s b/modules/mbsrtoc32s index e7e5ee2..64892cf 100644 --- a/modules/mbsrtoc32s +++ b/modules/mbsrtoc32s @@ -10,15 +10,13 @@ Depends-on: uchar wchar verify -mbrtoc32 [test $SMALL_WCHAR_T = 1] -strnlen1 [test $SMALL_WCHAR_T = 1] +mbrtoc32 +strnlen1 mbsrtowcs [test $SMALL_WCHAR_T = 0] configure.ac: AC_REQUIRE([gl_UCHAR_H]) -if test $SMALL_WCHAR_T = 1; then - AC_LIBOBJ([mbsrtoc32s-state]) -fi +AC_LIBOBJ([mbsrtoc32s-state]) gl_UCHAR_MODULE_INDICATOR([mbsrtoc32s]) Makefile.am: -- 2.7.4