unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Florian Weimer via Libc-alpha <libc-alpha@sourceware.org>
To: Carlos O'Donell via Libc-alpha <libc-alpha@sourceware.org>
Subject: Re: [PATCH v4 1/3] Add support for locales with zero collation rules.
Date: Thu, 29 Jul 2021 11:44:20 +0200	[thread overview]
Message-ID: <8735rx4fpn.fsf@oldenburg.str.redhat.com> (raw)
In-Reply-To: <20210729063515.1541388-2-carlos@redhat.com> (Carlos O'Donell via Libc-alpha's message of "Thu, 29 Jul 2021 02:35:13 -0400")

* Carlos O'Donell via Libc-alpha:

> @@ -600,42 +597,51 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
>                      if (c == L_('-') && *p != L_(']'))
>                        {
>  #if _LIBC
> -                        /* We have to find the collation sequence
> -                           value for C.  Collation sequence is nothing
> -                           we can regularly access.  The sequence
> -                           value is defined by the order in which the
> -                           definitions of the collation values for the
> -                           various characters appear in the source
> -                           file.  A strange concept, nowhere
> -                           documented.  */
> -                        uint32_t fcollseq;
> -                        uint32_t lcollseq;
> +			/* We must find the collation sequence values for
> +			   the low part of the range, the high part of the
> +			   range and the searched value FN.  We do this by
> +			   using the POSIX concept of Collation Element
> +			   Ordering, which is the defined order of elements
> +			   in the source locale.  FCOLLSEQ is the searched
> +			   element in the range, while LCOLLSEQ is the low
> +			   element in the range.  If we have no collation
> +			   rules (nrules == 0) then we must fall back to a
> +			   basic code point value for the collation
> +			   sequence value (which is correct for ASCII and
> +			   UTF-8).  We must never use collseq if nrules ==
> +			   0 since none of the tables we need will be
> +			   present in the compiled binary locale.  We start
> +			   with fcollseq and lcollseq at unknown collation
> +			   sequences.  We only compute hcollseq, the high
> +			   part of the range if required.  */
> +                        uint32_t fcollseq = ~((uint32_t) 0);
> +                        uint32_t lcollseq = ~((uint32_t) 0);
>                          UCHAR cend = *p++;

Looks like ~((uint32_t) 0) needs to be added as a macro/constant to
__collseq_table_lookup.



>  
> +			if (nrules != 0)
> +			  {
>  # if WIDE_CHAR_VERSION
> +			    /* Search the collation data for the character.  */
> +			    fcollseq = __collseq_table_lookup (collseq, fn);
> +			    if (fcollseq == ~((uint32_t) 0))
> +			      /* We don't know anything about the character
> +				 we are supposed to match.  This means we are
> +				 failing.  */
> +			      goto range_not_matched;
> +
> +			    if (is_seqval)
> +			      lcollseq = cold;
> +			    else
> +			      lcollseq = __collseq_table_lookup (collseq, cold);
>  # else
> +			    fcollseq = collseq[fn];
> +			    lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
>  # endif
> +			  }
>  
>                          is_seqval = false;
>                          if (cend == L_('[') && *p == L_('.'))
>                            {
>                              const CHAR *startp = p;
>                              size_t c1 = 0;
>  
> @@ -752,14 +758,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
>                              cend = FOLD (cend);
>                            }
>  
> +			/* If we have rules, and the low sequence is lower than
> +			   the value of the searched sequence then we must
> +			   lookup the high collation sequence value and
> +			   determine if the fcollseq falls within the range.
> +			   If hcollseq is unknown then we could still match
> +			   fcollseq on the low end of the range.  If lcollseq
> +			   if unknown (0xffffffff) we will still fail to
> +			   match, but in the future we might consider matching
> +			   the high end of the range on an exact match.  */
> +                        if (nrules != 0 && (
>  # if WIDE_CHAR_VERSION
>                              lcollseq == 0xffffffff ||

This should use the same constant as the initialization.  The #if is now
unnecessary and can be removed.

>  # endif
> +                            lcollseq <= fcollseq))
>                            {
>                              /* We have to look at the upper bound.  */
>                              uint32_t hcollseq;
> @@ -789,6 +801,17 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
>                              if (lcollseq <= hcollseq && fcollseq <= hcollseq)
>                                goto matched;
>                            }
> +
> +			/* No rules, but we have a range.  */
> +			if (nrules == 0)
> +			  {
> +			    if (cend == L_('\0'))
> +			      return FNM_NOMATCH;
> +
> +			    /* Compare that fn is within the range.  */
> +			    if ((UCHAR) cold <= fn && fn <= cend)
> +			      goto matched;
> +

This part looks okay to me.

> diff --git a/posix/regcomp.c b/posix/regcomp.c
> index d93698ae78..f55d20cbfd 100644
> --- a/posix/regcomp.c
> +++ b/posix/regcomp.c
> @@ -2889,7 +2889,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  	  if (MB_CUR_MAX == 1)
>  	  */
>  	  if (nrules == 0)
> -	    return collseqmb[br_elem->opr.ch];
> +	    return br_elem->opr.ch;
>  	  else
>  	    {
>  	      wint_t wc = __btowc (br_elem->opr.ch);
> @@ -2900,6 +2900,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  	{
>  	  if (nrules != 0)
>  	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
> +	  else
> +	    return br_elem->opr.wch;
>  	}
>        else if (br_elem->type == COLL_SYM)
>  	{
> @@ -2935,7 +2937,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  		}
>  	    }
>  	  else if (sym_name_len == 1)
> -	    return collseqmb[br_elem->opr.name[0]];
> +	    return br_elem->opr.name[0];
>  	}
>        return UINT_MAX;
>      }
> @@ -3017,7 +3019,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  	  if (MB_CUR_MAX == 1)
>  	  */
>  	  if (nrules == 0)
> -	    ch_collseq = collseqmb[ch];
> +	    ch_collseq = ch;
>  	  else
>  	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
>  	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
> @@ -3103,11 +3105,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>    int token_len;
>    bool first_round = true;
>  #ifdef _LIBC
> -  collseqmb = (const unsigned char *)
> -    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
>    nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
>    if (nrules)
>      {
> +      collseqmb = (const unsigned char *)
> +	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
>        /*
>        if (MB_CUR_MAX > 1)
>        */

These changes look good.

> diff --git a/posix/regexec.c b/posix/regexec.c
> index f7b4f9cfc3..6cc23831aa 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -3858,62 +3858,53 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
>  }
>  
>  # ifdef _LIBC
> +#include <assert.h>

This really should got to the start of the file.

> +
>  static unsigned int
>  find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
>  {
> +  int32_t idx;
> +  const unsigned char *extra = (const unsigned char *)
>  	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
> +  int32_t extrasize = (const unsigned char *)
>  	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
> +  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
> +
> +  /* Only called from within 'if (nrules != 0)'.  */ 

Trailing whitespace on the last line.  Most of this is just
reindentation, so okay.

> +  assert (nrules != 0);

Right, the single caller is under nrules != 0.

strcoll and strxfrm are already correct.

Thanks,
Florian


  reply	other threads:[~2021-07-29  9:44 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-29  6:35 [PATCH v4 0/3] C.UTF-8 Carlos O'Donell via Libc-alpha
2021-07-29  6:35 ` [PATCH v4 1/3] Add support for locales with zero collation rules Carlos O'Donell via Libc-alpha
2021-07-29  9:44   ` Florian Weimer via Libc-alpha [this message]
2021-07-29 19:12     ` Paul Eggert
2021-07-29  6:35 ` [PATCH v4 2/3] Add 'strcmp_collation' support for LC_COLLATE Carlos O'Donell via Libc-alpha
2021-07-29  6:35 ` [PATCH v4 3/3] Add generic C.UTF-8 locale (Bug 17318) Carlos O'Donell via Libc-alpha
2021-07-29  7:53 ` [PATCH v4 0/3] C.UTF-8 Florian Weimer via Libc-alpha
2021-07-30  3:12   ` Carlos O'Donell via Libc-alpha
2021-08-18  8:12   ` Mike Frysinger via Libc-alpha

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/libc/involved.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8735rx4fpn.fsf@oldenburg.str.redhat.com \
    --to=libc-alpha@sourceware.org \
    --cc=fweimer@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).