From: Florian Weimer via Libc-alpha <libc-alpha@sourceware.org>
To: Carlos O'Donell via Libc-alpha <libc-alpha@sourceware.org>
Subject: Re: [PATCH v4 1/3] Add support for locales with zero collation rules.
Date: Thu, 29 Jul 2021 11:44:20 +0200 [thread overview]
Message-ID: <8735rx4fpn.fsf@oldenburg.str.redhat.com> (raw)
In-Reply-To: <20210729063515.1541388-2-carlos@redhat.com> (Carlos O'Donell via Libc-alpha's message of "Thu, 29 Jul 2021 02:35:13 -0400")
* Carlos O'Donell via Libc-alpha:
> @@ -600,42 +597,51 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
> if (c == L_('-') && *p != L_(']'))
> {
> #if _LIBC
> - /* We have to find the collation sequence
> - value for C. Collation sequence is nothing
> - we can regularly access. The sequence
> - value is defined by the order in which the
> - definitions of the collation values for the
> - various characters appear in the source
> - file. A strange concept, nowhere
> - documented. */
> - uint32_t fcollseq;
> - uint32_t lcollseq;
> + /* We must find the collation sequence values for
> + the low part of the range, the high part of the
> + range and the searched value FN. We do this by
> + using the POSIX concept of Collation Element
> + Ordering, which is the defined order of elements
> + in the source locale. FCOLLSEQ is the searched
> + element in the range, while LCOLLSEQ is the low
> + element in the range. If we have no collation
> + rules (nrules == 0) then we must fall back to a
> + basic code point value for the collation
> + sequence value (which is correct for ASCII and
> + UTF-8). We must never use collseq if nrules ==
> + 0 since none of the tables we need will be
> + present in the compiled binary locale. We start
> + with fcollseq and lcollseq at unknown collation
> + sequences. We only compute hcollseq, the high
> + part of the range if required. */
> + uint32_t fcollseq = ~((uint32_t) 0);
> + uint32_t lcollseq = ~((uint32_t) 0);
> UCHAR cend = *p++;
Looks like ~((uint32_t) 0) needs to be added as a macro/constant to
__collseq_table_lookup.
>
> + if (nrules != 0)
> + {
> # if WIDE_CHAR_VERSION
> + /* Search the collation data for the character. */
> + fcollseq = __collseq_table_lookup (collseq, fn);
> + if (fcollseq == ~((uint32_t) 0))
> + /* We don't know anything about the character
> + we are supposed to match. This means we are
> + failing. */
> + goto range_not_matched;
> +
> + if (is_seqval)
> + lcollseq = cold;
> + else
> + lcollseq = __collseq_table_lookup (collseq, cold);
> # else
> + fcollseq = collseq[fn];
> + lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
> # endif
> + }
>
> is_seqval = false;
> if (cend == L_('[') && *p == L_('.'))
> {
> const CHAR *startp = p;
> size_t c1 = 0;
>
> @@ -752,14 +758,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
> cend = FOLD (cend);
> }
>
> + /* If we have rules, and the low sequence is lower than
> + the value of the searched sequence then we must
> + lookup the high collation sequence value and
> + determine if the fcollseq falls within the range.
> + If hcollseq is unknown then we could still match
> + fcollseq on the low end of the range. If lcollseq
> + if unknown (0xffffffff) we will still fail to
> + match, but in the future we might consider matching
> + the high end of the range on an exact match. */
> + if (nrules != 0 && (
> # if WIDE_CHAR_VERSION
> lcollseq == 0xffffffff ||
This should use the same constant as the initialization. The #if is now
unnecessary and can be removed.
> # endif
> + lcollseq <= fcollseq))
> {
> /* We have to look at the upper bound. */
> uint32_t hcollseq;
> @@ -789,6 +801,17 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
> if (lcollseq <= hcollseq && fcollseq <= hcollseq)
> goto matched;
> }
> +
> + /* No rules, but we have a range. */
> + if (nrules == 0)
> + {
> + if (cend == L_('\0'))
> + return FNM_NOMATCH;
> +
> + /* Compare that fn is within the range. */
> + if ((UCHAR) cold <= fn && fn <= cend)
> + goto matched;
> +
This part looks okay to me.
> diff --git a/posix/regcomp.c b/posix/regcomp.c
> index d93698ae78..f55d20cbfd 100644
> --- a/posix/regcomp.c
> +++ b/posix/regcomp.c
> @@ -2889,7 +2889,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> if (MB_CUR_MAX == 1)
> */
> if (nrules == 0)
> - return collseqmb[br_elem->opr.ch];
> + return br_elem->opr.ch;
> else
> {
> wint_t wc = __btowc (br_elem->opr.ch);
> @@ -2900,6 +2900,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> {
> if (nrules != 0)
> return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
> + else
> + return br_elem->opr.wch;
> }
> else if (br_elem->type == COLL_SYM)
> {
> @@ -2935,7 +2937,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> }
> }
> else if (sym_name_len == 1)
> - return collseqmb[br_elem->opr.name[0]];
> + return br_elem->opr.name[0];
> }
> return UINT_MAX;
> }
> @@ -3017,7 +3019,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> if (MB_CUR_MAX == 1)
> */
> if (nrules == 0)
> - ch_collseq = collseqmb[ch];
> + ch_collseq = ch;
> else
> ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
> if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
> @@ -3103,11 +3105,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> int token_len;
> bool first_round = true;
> #ifdef _LIBC
> - collseqmb = (const unsigned char *)
> - _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
> nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
> if (nrules)
> {
> + collseqmb = (const unsigned char *)
> + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
> /*
> if (MB_CUR_MAX > 1)
> */
These changes look good.
> diff --git a/posix/regexec.c b/posix/regexec.c
> index f7b4f9cfc3..6cc23831aa 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -3858,62 +3858,53 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
> }
>
> # ifdef _LIBC
> +#include <assert.h>
This really should got to the start of the file.
> +
> static unsigned int
> find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
> {
> + int32_t idx;
> + const unsigned char *extra = (const unsigned char *)
> _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
> + int32_t extrasize = (const unsigned char *)
> _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
> + uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
> +
> + /* Only called from within 'if (nrules != 0)'. */
Trailing whitespace on the last line. Most of this is just
reindentation, so okay.
> + assert (nrules != 0);
Right, the single caller is under nrules != 0.
strcoll and strxfrm are already correct.
Thanks,
Florian
next prev parent reply other threads:[~2021-07-29 9:44 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-07-29 6:35 [PATCH v4 0/3] C.UTF-8 Carlos O'Donell via Libc-alpha
2021-07-29 6:35 ` [PATCH v4 1/3] Add support for locales with zero collation rules Carlos O'Donell via Libc-alpha
2021-07-29 9:44 ` Florian Weimer via Libc-alpha [this message]
2021-07-29 19:12 ` Paul Eggert
2021-07-29 6:35 ` [PATCH v4 2/3] Add 'strcmp_collation' support for LC_COLLATE Carlos O'Donell via Libc-alpha
2021-07-29 6:35 ` [PATCH v4 3/3] Add generic C.UTF-8 locale (Bug 17318) Carlos O'Donell via Libc-alpha
2021-07-29 7:53 ` [PATCH v4 0/3] C.UTF-8 Florian Weimer via Libc-alpha
2021-07-30 3:12 ` Carlos O'Donell via Libc-alpha
2021-08-18 8:12 ` Mike Frysinger via Libc-alpha
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/libc/involved.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=8735rx4fpn.fsf@oldenburg.str.redhat.com \
--to=libc-alpha@sourceware.org \
--cc=fweimer@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).