bug-gnulib@gnu.org mirror (unofficial)
 help / color / mirror / Atom feed
* [PATCH] Simplify and regularize regex use of ‘assert’
@ 2019-10-11 19:44 Paul Eggert
  2019-10-13 12:20 ` Bruno Haible
  0 siblings, 1 reply; 3+ messages in thread
From: Paul Eggert @ 2019-10-11 19:44 UTC (permalink / raw)
  To: bug-gnulib; +Cc: Paul Eggert

Also, tell GCC about the asserts even when compiling without
debugging, to give it further optimization opportunities.
* lib/regex_internal.h (DEBUG_ASSERT): New macro.
* lib/regcomp.c (link_nfa_nodes, calc_eclosure)
(parse_expression, parse_bracket_exp):
* lib/regex_internal.c (build_wcs_buffer)
(build_wcs_upper_buffer, re_string_reconstruct)
(re_string_context_at):
* lib/regexec.c (re_search_stub, re_copy_regs)
(re_search_internal, prune_impossible_nodes, check_matching)
(check_halt_state_context, pop_fail_stack, set_regs)
(sift_states_backward, build_sifted_states, transit_state_mb)
(transit_state_bkref, check_arrival_add_next_nodes)
(check_arrival_expand_ecl, match_ctx_add_subtop):
Use it instead of plain ‘assert’.
---
 ChangeLog            | 19 ++++++++++++
 lib/regcomp.c        | 22 +++++--------
 lib/regex_internal.c | 13 +++-----
 lib/regex_internal.h |  8 ++++-
 lib/regexec.c        | 73 ++++++++++++++------------------------------
 5 files changed, 61 insertions(+), 74 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e9e337666..0297bd069 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2019-10-11  Paul Eggert  <eggert@cs.ucla.edu>
+
+	Simplify and regularize regex use of ‘assert’
+	Also, tell GCC about the asserts even when compiling without
+	debugging, to give it further optimization opportunities.
+	* lib/regex_internal.h (DEBUG_ASSERT): New macro.
+	* lib/regcomp.c (link_nfa_nodes, calc_eclosure)
+	(parse_expression, parse_bracket_exp):
+	* lib/regex_internal.c (build_wcs_buffer)
+	(build_wcs_upper_buffer, re_string_reconstruct)
+	(re_string_context_at):
+	* lib/regexec.c (re_search_stub, re_copy_regs)
+	(re_search_internal, prune_impossible_nodes, check_matching)
+	(check_halt_state_context, pop_fail_stack, set_regs)
+	(sift_states_backward, build_sifted_states, transit_state_mb)
+	(transit_state_bkref, check_arrival_add_next_nodes)
+	(check_arrival_expand_ecl, match_ctx_add_subtop):
+	Use it instead of plain ‘assert’.
+
 2019-10-09  Paul Eggert  <eggert@cs.ucla.edu>
 
 	regex: omit debug assignment when not debugging
diff --git a/lib/regcomp.c b/lib/regcomp.c
index c1f7f2b2a..3e8f1e610 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -1436,7 +1436,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node)
       break;
 
     case END_OF_RE:
-      assert (node->next == NULL);
+      DEBUG_ASSERT (node->next == NULL);
       break;
 
     case OP_DUP_ASTERISK:
@@ -1452,8 +1452,8 @@ link_nfa_nodes (void *extra, bin_tree_t *node)
 	  right = node->right->first->node_idx;
 	else
 	  right = node->next->node_idx;
-	assert (left > -1);
-	assert (right > -1);
+	DEBUG_ASSERT (left > -1);
+	DEBUG_ASSERT (right > -1);
 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
       }
       break;
@@ -1471,7 +1471,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node)
       break;
 
     default:
-      assert (!IS_EPSILON_NODE (node->token.type));
+      DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
       dfa->nexts[idx] = node->next->node_idx;
       break;
     }
@@ -1653,9 +1653,7 @@ calc_eclosure (re_dfa_t *dfa)
 {
   Idx node_idx;
   bool incomplete;
-#ifdef DEBUG
-  assert (dfa->nodes_len > 0);
-#endif
+  DEBUG_ASSERT (dfa->nodes_len > 0);
   incomplete = false;
   /* For each nodes, calculate epsilon closure.  */
   for (node_idx = 0; ; ++node_idx)
@@ -1670,9 +1668,7 @@ calc_eclosure (re_dfa_t *dfa)
 	  node_idx = 0;
 	}
 
-#ifdef DEBUG
-      assert (dfa->eclosures[node_idx].nelem != -1);
-#endif
+      DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
 
       /* If we have already calculated, skip it.  */
       if (dfa->eclosures[node_idx].nelem != 0)
@@ -2442,9 +2438,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
 
     default:
       /* Must not happen?  */
-#ifdef DEBUG
-      assert (0);
-#endif
+      DEBUG_ASSERT (false);
       return NULL;
     }
   fetch_token (token, regexp, syntax);
@@ -3306,7 +3300,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	       goto parse_bracket_exp_free_return;
 	      break;
 	    default:
-	      assert (0);
+	      DEBUG_ASSERT (false);
 	      break;
 	    }
 	}
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index 99fbb26ec..6aa911608 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -212,7 +212,7 @@ build_wcs_buffer (re_string_t *pstr)
 {
 #ifdef _LIBC
   unsigned char buf[MB_LEN_MAX];
-  assert (MB_LEN_MAX >= pstr->mb_cur_max);
+  DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
 #else
   unsigned char buf[64];
 #endif
@@ -285,7 +285,7 @@ build_wcs_upper_buffer (re_string_t *pstr)
   size_t mbclen;
 #ifdef _LIBC
   char buf[MB_LEN_MAX];
-  assert (MB_LEN_MAX >= pstr->mb_cur_max);
+  DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
 #else
   char buf[64];
 #endif
@@ -685,9 +685,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 			 pstr->valid_len - offset);
 	      pstr->valid_len -= offset;
 	      pstr->valid_raw_len -= offset;
-#if defined DEBUG && DEBUG
-	      assert (pstr->valid_len > 0);
-#endif
+	      DEBUG_ASSERT (pstr->valid_len > 0);
 	    }
 	}
       else
@@ -941,10 +939,7 @@ re_string_context_at (const re_string_t *input, Idx idx, int eflags)
       Idx wc_idx = idx;
       while(input->wcs[wc_idx] == WEOF)
 	{
-#if defined DEBUG && DEBUG
-	  /* It must not happen.  */
-	  assert (wc_idx >= 0);
-#endif
+	  DEBUG_ASSERT (wc_idx >= 0);
 	  --wc_idx;
 	  if (wc_idx < 0)
 	    return input->tip_context;
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
index b6eeba32d..06957f0fc 100644
--- a/lib/regex_internal.h
+++ b/lib/regex_internal.h
@@ -20,7 +20,6 @@
 #ifndef _REGEX_INTERNAL_H
 #define _REGEX_INTERNAL_H 1
 
-#include <assert.h>
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -36,6 +35,13 @@
 #include <intprops.h>
 #include <verify.h>
 
+#if defined DEBUG && DEBUG != 0
+# include <assert.h>
+# define DEBUG_ASSERT(x) assert (x)
+#else
+# define DEBUG_ASSERT(x) assume (x)
+#endif
+
 #ifdef _LIBC
 # include <libc-lock.h>
 # define lock_define(name) __libc_lock_define (, name)
diff --git a/lib/regexec.c b/lib/regexec.c
index 809f89e26..3c46ac81d 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -443,7 +443,7 @@ re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
     {
       if (ret_len)
 	{
-	  assert (pmatch[0].rm_so == start);
+	  DEBUG_ASSERT (pmatch[0].rm_so == start);
 	  rval = pmatch[0].rm_eo - start;
 	}
       else
@@ -502,9 +502,9 @@ re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
     }
   else
     {
-      assert (regs_allocated == REGS_FIXED);
+      DEBUG_ASSERT (regs_allocated == REGS_FIXED);
       /* This function may not be called with REGS_FIXED and nregs too big.  */
-      assert (regs->num_regs >= nregs);
+      DEBUG_ASSERT (nregs <= regs->num_regs);
       rval = REGS_FIXED;
     }
 
@@ -613,10 +613,8 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
 			|| dfa->init_state_begbuf == NULL))
     return REG_NOMATCH;
 
-#ifdef DEBUG
   /* We assume front-end functions already check them.  */
-  assert (0 <= last_start && last_start <= length);
-#endif
+  DEBUG_ASSERT (0 <= last_start && last_start <= length);
 
   /* If initial states with non-begbuf contexts have no elements,
      the regex must be anchored.  If preg->newline_anchor is set,
@@ -817,9 +815,7 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
 		    break;
 		  if (__glibc_unlikely (err != REG_NOMATCH))
 		    goto free_return;
-#ifdef DEBUG
 		  match_last = -1;
-#endif
 		}
 	      else
 		break; /* We found a match.  */
@@ -829,10 +825,8 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
       match_ctx_clean (&mctx);
     }
 
-#ifdef DEBUG
-  assert (match_last != -1);
-  assert (err == REG_NOERROR);
-#endif
+  DEBUG_ASSERT (match_last != -1);
+  DEBUG_ASSERT (err == REG_NOERROR);
 
   /* Set pmatch[] if we need.  */
   if (nmatch > 0)
@@ -877,7 +871,7 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 	      }
 #else
-	    assert (mctx.input.offsets_needed == 0);
+	    DEBUG_ASSERT (mctx.input.offsets_needed == 0);
 #endif
 	    pmatch[reg_idx].rm_so += match_first;
 	    pmatch[reg_idx].rm_eo += match_first;
@@ -917,9 +911,7 @@ prune_impossible_nodes (re_match_context_t *mctx)
   re_dfastate_t **sifted_states;
   re_dfastate_t **lim_states = NULL;
   re_sift_context_t sctx;
-#ifdef DEBUG
-  assert (mctx->state_log != NULL);
-#endif
+  DEBUG_ASSERT (mctx->state_log != NULL);
   match_last = mctx->match_last;
   halt_node = mctx->last_node;
 
@@ -1065,7 +1057,7 @@ check_matching (re_match_context_t *mctx, bool fl_longest_match,
   /* An initial state must not be NULL (invalid).  */
   if (__glibc_unlikely (cur_state == NULL))
     {
-      assert (err == REG_ESPACE);
+      DEBUG_ASSERT (err == REG_ESPACE);
       return -2;
     }
 
@@ -1120,7 +1112,7 @@ check_matching (re_match_context_t *mctx, bool fl_longest_match,
 	  err = extend_buffers (mctx, next_char_idx + 1);
 	  if (__glibc_unlikely (err != REG_NOERROR))
 	    {
-	      assert (err == REG_ESPACE);
+	      DEBUG_ASSERT (err == REG_ESPACE);
 	      return -2;
 	    }
 	}
@@ -1203,9 +1195,7 @@ check_halt_state_context (const re_match_context_t *mctx,
 {
   Idx i;
   unsigned int context;
-#ifdef DEBUG
-  assert (state->halt);
-#endif
+  DEBUG_ASSERT (state->halt);
   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
   for (i = 0; i < state->nodes.nelem; ++i)
     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
@@ -1353,7 +1343,7 @@ pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
 		regmatch_t *regs, re_node_set *eps_via_nodes)
 {
   Idx num = --fs->num;
-  assert (num >= 0);
+  DEBUG_ASSERT (num >= 0);
   *pidx = fs->stack[num].idx;
   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
   re_node_set_free (eps_via_nodes);
@@ -1380,10 +1370,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
   regmatch_t *prev_idx_match;
   bool prev_idx_match_malloced = false;
 
-#ifdef DEBUG
-  assert (nmatch > 1);
-  assert (mctx->state_log != NULL);
-#endif
+  DEBUG_ASSERT (nmatch > 1);
+  DEBUG_ASSERT (mctx->state_log != NULL);
   if (fl_backtrack)
     {
       fs = &fs_body;
@@ -1569,9 +1557,7 @@ sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
   Idx str_idx = sctx->last_str_idx;
   re_node_set cur_dest;
 
-#ifdef DEBUG
-  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
-#endif
+  DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
 
   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
      transit to the last_node and the last_node itself.  */
@@ -1639,11 +1625,8 @@ build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
       Idx prev_node = cur_src->elems[i];
       int naccepted = 0;
       bool ok;
+      DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
 
-#ifdef DEBUG
-      re_token_type_t type = dfa->nodes[prev_node].type;
-      assert (!IS_EPSILON_NODE (type));
-#endif
 #ifdef RE_ENABLE_I18N
       /* If the node may accept "multi byte".  */
       if (dfa->nodes[prev_node].accept_mb)
@@ -2496,9 +2479,7 @@ transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
       err = clean_state_log_if_needed (mctx, dest_idx);
       if (__glibc_unlikely (err != REG_NOERROR))
 	return err;
-#ifdef DEBUG
-      assert (dfa->nexts[cur_node_idx] != -1);
-#endif
+      DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
 
       dest_state = mctx->state_log[dest_idx];
@@ -2562,9 +2543,7 @@ transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
 
       /* And add the epsilon closures (which is 'new_dest_nodes') of
 	 the backreference to appropriate state_log.  */
-#ifdef DEBUG
-      assert (dfa->nexts[node_idx] != -1);
-#endif
+      DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
 	{
 	  Idx subexp_len;
@@ -3023,10 +3002,8 @@ check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
     {
       int naccepted = 0;
       Idx cur_node = cur_nodes->elems[cur_idx];
-#ifdef DEBUG
-      re_token_type_t type = dfa->nodes[cur_node].type;
-      assert (!IS_EPSILON_NODE (type));
-#endif
+      DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
+
 #ifdef RE_ENABLE_I18N
       /* If the node may accept "multi byte".  */
       if (dfa->nodes[cur_node].accept_mb)
@@ -3094,9 +3071,7 @@ check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
   reg_errcode_t err;
   Idx idx, outside_node;
   re_node_set new_nodes;
-#ifdef DEBUG
-  assert (cur_nodes->nelem);
-#endif
+  DEBUG_ASSERT (cur_nodes->nelem);
   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
   if (__glibc_unlikely (err != REG_NOERROR))
     return err;
@@ -4264,10 +4239,8 @@ static reg_errcode_t
 __attribute_warn_unused_result__
 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
 {
-#ifdef DEBUG
-  assert (mctx->sub_tops != NULL);
-  assert (mctx->asub_tops > 0);
-#endif
+  DEBUG_ASSERT (mctx->sub_tops != NULL);
+  DEBUG_ASSERT (mctx->asub_tops > 0);
   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
     {
       Idx new_asub_tops = mctx->asub_tops * 2;
-- 
2.21.0



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] Simplify and regularize regex use of ‘assert’
  2019-10-11 19:44 [PATCH] Simplify and regularize regex use of ‘assert’ Paul Eggert
@ 2019-10-13 12:20 ` Bruno Haible
  2019-10-13 20:18   ` Paul Eggert
  0 siblings, 1 reply; 3+ messages in thread
From: Bruno Haible @ 2019-10-13 12:20 UTC (permalink / raw)
  To: Paul Eggert; +Cc: bug-gnulib

Hi Paul,

> * lib/regex_internal.c (build_wcs_buffer)
> (build_wcs_upper_buffer, re_string_reconstruct)
> (re_string_context_at):

Note: Karl's autoupdate from glibc reverted this change.

Bruno



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] Simplify and regularize regex use of ‘assert’
  2019-10-13 12:20 ` Bruno Haible
@ 2019-10-13 20:18   ` Paul Eggert
  0 siblings, 0 replies; 3+ messages in thread
From: Paul Eggert @ 2019-10-13 20:18 UTC (permalink / raw)
  To: Bruno Haible; +Cc: bug-gnulib

[-- Attachment #1: Type: text/plain, Size: 225 bytes --]

On 10/13/19 5:20 AM, Bruno Haible wrote:
> Karl's autoupdate from glibc reverted this change.

Thanks for mentioning that. I installed the attached to fix it. I do plan to 
migrate this back to glibc, but one step at a time.

[-- Attachment #2: 0001-config-srclist.txt-Remove-posix-regex_internal.c-for.patch --]
[-- Type: text/x-patch, Size: 2541 bytes --]

From 6cfb4302b3e1da14d706198b693558290e9b00f4 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sun, 13 Oct 2019 13:17:05 -0700
Subject: [PATCH] * config/srclist.txt: Remove posix/regex_internal.c for now.

---
 ChangeLog            |  4 ++++
 config/srclist.txt   |  2 +-
 lib/regex_internal.c | 13 ++++---------
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 877e275e7..d27674fa6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2019-10-13  Paul Eggert  <eggert@cs.ucla.edu>
+
+	* config/srclist.txt: Remove posix/regex_internal.c for now.
+
 2019-10-13  Bruno Haible  <bruno@clisp.org>
 
 	git-version-gen: Allow 'snapshot' as .tarball-version contents.
diff --git a/config/srclist.txt b/config/srclist.txt
index c53fb90a8..e001d15c3 100644
--- a/config/srclist.txt
+++ b/config/srclist.txt
@@ -55,7 +55,7 @@ $LIBCSRC malloc/scratch_buffer_set_array_size.c	lib/malloc
 #$LIBCSRC posix/regcomp.c		lib
 $LIBCSRC posix/regex.c			lib
 $LIBCSRC posix/regex.h			lib
-$LIBCSRC posix/regex_internal.c		lib
+#$LIBCSRC posix/regex_internal.c	lib
 #$LIBCSRC posix/regex_internal.h	lib
 #$LIBCSRC posix/regexec.c		lib
 $LIBCSRC time/timegm.c			lib
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index 99fbb26ec..6aa911608 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -212,7 +212,7 @@ build_wcs_buffer (re_string_t *pstr)
 {
 #ifdef _LIBC
   unsigned char buf[MB_LEN_MAX];
-  assert (MB_LEN_MAX >= pstr->mb_cur_max);
+  DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
 #else
   unsigned char buf[64];
 #endif
@@ -285,7 +285,7 @@ build_wcs_upper_buffer (re_string_t *pstr)
   size_t mbclen;
 #ifdef _LIBC
   char buf[MB_LEN_MAX];
-  assert (MB_LEN_MAX >= pstr->mb_cur_max);
+  DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
 #else
   char buf[64];
 #endif
@@ -685,9 +685,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 			 pstr->valid_len - offset);
 	      pstr->valid_len -= offset;
 	      pstr->valid_raw_len -= offset;
-#if defined DEBUG && DEBUG
-	      assert (pstr->valid_len > 0);
-#endif
+	      DEBUG_ASSERT (pstr->valid_len > 0);
 	    }
 	}
       else
@@ -941,10 +939,7 @@ re_string_context_at (const re_string_t *input, Idx idx, int eflags)
       Idx wc_idx = idx;
       while(input->wcs[wc_idx] == WEOF)
 	{
-#if defined DEBUG && DEBUG
-	  /* It must not happen.  */
-	  assert (wc_idx >= 0);
-#endif
+	  DEBUG_ASSERT (wc_idx >= 0);
 	  --wc_idx;
 	  if (wc_idx < 0)
 	    return input->tip_context;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2019-10-13 20:18 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-11 19:44 [PATCH] Simplify and regularize regex use of ‘assert’ Paul Eggert
2019-10-13 12:20 ` Bruno Haible
2019-10-13 20:18   ` Paul Eggert

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).