* [PATCH 01/10] regex: improve comments
@ 2021-02-06 1:25 Paul Eggert
2021-02-06 1:25 ` [PATCH 02/10] regex: avoid undefined behavior Paul Eggert
` (8 more replies)
0 siblings, 9 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:25 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* lib/regexec.c: Add and correct comments about return values.
---
ChangeLog | 5 +++++
lib/regexec.c | 21 +++++++++++++--------
2 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 717e51b16..fef04a89c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+
+ regex: improve comments
+ * lib/regexec.c: Add and correct comments about return values.
+
2021-01-31 Bruno Haible <bruno@clisp.org>
relocatable-prog-wrapper: Tweak today's patch.
diff --git a/lib/regexec.c b/lib/regexec.c
index f7b4f9cfc..fa413dfd5 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -186,7 +186,8 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
REG_NOTBOL is set, then ^ does not match at the beginning of the
string; if REG_NOTEOL is set, then $ does not match at the end.
- We return 0 if we find a match and REG_NOMATCH if not. */
+ Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
+ EFLAGS is invalid. */
int
regexec (const regex_t *__restrict preg, const char *__restrict string,
@@ -269,8 +270,8 @@ compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
strings.)
On success, re_match* functions return the length of the match, re_search*
- return the position of the start of the match. Return value -1 means no
- match was found and -2 indicates an internal error. */
+ return the position of the start of the match. They return -1 on
+ match failure, -2 on error. */
regoff_t
re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
@@ -1206,7 +1207,7 @@ check_halt_state_context (const re_match_context_t *mctx,
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
corresponding to the DFA).
Return the destination node, and update EPS_VIA_NODES;
- return -1 in case of errors. */
+ return -1 on match failure, -2 on error. */
static Idx
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
@@ -2195,6 +2196,7 @@ sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
/* Return the next state to which the current state STATE will transit by
accepting the current input byte, and update STATE_LOG if necessary.
+ Return NULL on failure.
If STATE can accept a multibyte char/collating element/back reference
update the destination of STATE_LOG. */
@@ -2395,7 +2397,7 @@ check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
#if 0
/* Return the next state to which the current state STATE will transit by
- accepting the current input byte. */
+ accepting the current input byte. Return NULL on failure. */
static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
@@ -2817,7 +2819,8 @@ find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
LAST_NODE at LAST_STR. We record the path onto PATH since it will be
heavily reused.
- Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
+ Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
+ REG_ESPACE if memory is exhausted. */
static reg_errcode_t
__attribute_warn_unused_result__
@@ -3433,7 +3436,8 @@ build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
/* Group all nodes belonging to STATE into several destinations.
Then for all destinations, set the nodes belonging to the destination
to DESTS_NODE[i] and set the characters accepted by the destination
- to DEST_CH[i]. This function return the number of destinations. */
+ to DEST_CH[i]. Return the number of destinations if successful,
+ -1 on internal error. */
static Idx
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
@@ -4211,7 +4215,8 @@ match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
}
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
- at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
+ at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
+ Return the new entry if successful, NULL if memory is exhausted. */
static re_sub_match_last_t *
match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 02/10] regex: avoid undefined behavior
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
@ 2021-02-06 1:25 ` Paul Eggert
2021-02-06 1:25 ` [PATCH 03/10] regex: minor refactoring Paul Eggert
` (7 subsequent siblings)
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:25 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* lib/regexec.c (pop_fail_stack): If the stack is empty, return -1
instead of indulging in undefined behavior. This simplifies
callers, and avoids undefined behavior in some cases (see glibc
bug 11053, though this change does not fix that overall bug).
---
ChangeLog | 6 ++++++
lib/regexec.c | 30 ++++++++++++++----------------
2 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index fef04a89c..6fcd5819f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: avoid undefined behavior
+ * lib/regexec.c (pop_fail_stack): If the stack is empty, return -1
+ instead of indulging in undefined behavior. This simplifies
+ callers, and avoids undefined behavior in some cases (see glibc
+ bug 11053, though this change does not fix that overall bug).
+
regex: improve comments
* lib/regexec.c: Add and correct comments about return values.
diff --git a/lib/regexec.c b/lib/regexec.c
index fa413dfd5..f982e3aba 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -1346,13 +1346,15 @@ static Idx
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
regmatch_t *regs, re_node_set *eps_via_nodes)
{
+ if (fs == NULL || fs->num == 0)
+ return -1;
Idx num = --fs->num;
- DEBUG_ASSERT (num >= 0);
*pidx = fs->stack[num].idx;
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
re_node_set_free (eps_via_nodes);
re_free (fs->stack[num].regs);
*eps_via_nodes = fs->stack[num].eps_via_nodes;
+ DEBUG_ASSERT (0 <= fs->stack[num].node);
return fs->stack[num].node;
}
@@ -1411,25 +1413,22 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
{
Idx reg_idx;
+ cur_node = -1;
if (fs)
{
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
- break;
- if (reg_idx == nmatch)
- {
- re_node_set_free (&eps_via_nodes);
- regmatch_list_free (&prev_match);
- return free_fail_stack_return (fs);
- }
- cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
- &eps_via_nodes);
+ {
+ cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
+ &eps_via_nodes);
+ break;
+ }
}
- else
+ if (cur_node < 0)
{
re_node_set_free (&eps_via_nodes);
regmatch_list_free (&prev_match);
- return REG_NOERROR;
+ return free_fail_stack_return (fs);
}
}
@@ -1446,13 +1445,12 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
free_fail_stack_return (fs);
return REG_ESPACE;
}
- if (fs)
- cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
- &eps_via_nodes);
- else
+ cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, &eps_via_nodes);
+ if (cur_node < 0)
{
re_node_set_free (&eps_via_nodes);
regmatch_list_free (&prev_match);
+ free_fail_stack_return (fs);
return REG_NOMATCH;
}
}
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 03/10] regex: minor refactoring
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
2021-02-06 1:25 ` [PATCH 02/10] regex: avoid undefined behavior Paul Eggert
@ 2021-02-06 1:25 ` Paul Eggert
2021-02-06 1:25 ` [PATCH 04/10] regex: make it easier to merge into glibc Paul Eggert
` (6 subsequent siblings)
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:25 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* lib/regexec.c (proceed_next_node): Use more-local decls.
---
ChangeLog | 3 +++
lib/regexec.c | 14 ++++++--------
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 6fcd5819f..fdc107673 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: minor refactoring
+ * lib/regexec.c (proceed_next_node): Use more-local decls.
+
regex: avoid undefined behavior
* lib/regexec.c (pop_fail_stack): If the stack is empty, return -1
instead of indulging in undefined behavior. This simplifies
diff --git a/lib/regexec.c b/lib/regexec.c
index f982e3aba..fdd2e373e 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -1215,19 +1215,17 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
struct re_fail_stack_t *fs)
{
const re_dfa_t *const dfa = mctx->dfa;
- Idx i;
- bool ok;
if (IS_EPSILON_NODE (dfa->nodes[node].type))
{
re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
re_node_set *edests = &dfa->edests[node];
- Idx dest_node;
- ok = re_node_set_insert (eps_via_nodes, node);
+ bool ok = re_node_set_insert (eps_via_nodes, node);
if (__glibc_unlikely (! ok))
return -2;
- /* Pick up a valid destination, or return -1 if none
- is found. */
- for (dest_node = -1, i = 0; i < edests->nelem; ++i)
+
+ /* Pick a valid destination, or return -1 if none is found. */
+ Idx dest_node = -1;
+ for (Idx i = 0; i < edests->nelem; i++)
{
Idx candidate = edests->elems[i];
if (!re_node_set_contains (cur_nodes, candidate))
@@ -1289,7 +1287,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
if (naccepted == 0)
{
Idx dest_node;
- ok = re_node_set_insert (eps_via_nodes, node);
+ bool ok = re_node_set_insert (eps_via_nodes, node);
if (__glibc_unlikely (! ok))
return -2;
dest_node = dfa->edests[node].elems[0];
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 04/10] regex: make it easier to merge into glibc
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
2021-02-06 1:25 ` [PATCH 02/10] regex: avoid undefined behavior Paul Eggert
2021-02-06 1:25 ` [PATCH 03/10] regex: minor refactoring Paul Eggert
@ 2021-02-06 1:25 ` Paul Eggert
2021-02-06 1:25 ` [PATCH 05/10] regex-tests: fix typo Paul Eggert
` (5 subsequent siblings)
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:25 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* lib/regex_internal.h [_LIBC]: Do not include Gnulib’s dynarray.h.
---
ChangeLog | 3 +++
lib/regex_internal.h | 5 ++++-
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/ChangeLog b/ChangeLog
index fdc107673..82aa61a04 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: make it easier to merge into glibc
+ * lib/regex_internal.h [_LIBC]: Do not include Gnulib’s dynarray.h.
+
regex: minor refactoring
* lib/regexec.c (proceed_next_node): Use more-local decls.
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
index 3fa2bf1aa..4b0a3efb6 100644
--- a/lib/regex_internal.h
+++ b/lib/regex_internal.h
@@ -32,7 +32,10 @@
#include <stdbool.h>
#include <stdint.h>
-#include <dynarray.h>
+#ifndef _LIBC
+# include <dynarray.h>
+#endif
+
#include <intprops.h>
#include <verify.h>
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 05/10] regex-tests: fix typo
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
` (2 preceding siblings ...)
2021-02-06 1:25 ` [PATCH 04/10] regex: make it easier to merge into glibc Paul Eggert
@ 2021-02-06 1:25 ` Paul Eggert
2021-02-06 1:25 ` [PATCH 06/10] regex: avoid duplicate in espilon closure Paul Eggert
` (4 subsequent siblings)
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:25 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* tests/test-regex.c (main): Fix typo that would have caused an
old test case to report incorrect values on failure.
---
ChangeLog | 4 ++++
tests/test-regex.c | 3 ++-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/ChangeLog b/ChangeLog
index 82aa61a04..f4583e0af 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex-tests: fix typo
+ * tests/test-regex.c (main): Fix typo that would have caused an
+ old test case to report incorrect values on failure.
+
regex: make it easier to merge into glibc
* lib/regex_internal.h [_LIBC]: Do not include Gnulib’s dynarray.h.
diff --git a/tests/test-regex.c b/tests/test-regex.c
index 4a4ff39fd..58f8200f2 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -290,7 +290,8 @@ main (void)
{
memset (®s, 0, sizeof regs);
static char const data[] = "WXY";
- if (re_search (®ex, data, sizeof data - 1, 0, 3, ®s) < 0)
+ i = re_search (®ex, data, sizeof data - 1, 0, 3, ®s);
+ if (i < 0)
report_error ("re_search '%s' on '%s' returned %d", pat_x, data, i);
regfree (®ex);
free (regs.start);
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 06/10] regex: avoid duplicate in espilon closure
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
` (3 preceding siblings ...)
2021-02-06 1:25 ` [PATCH 05/10] regex-tests: fix typo Paul Eggert
@ 2021-02-06 1:25 ` Paul Eggert
2021-02-06 1:25 ` [PATCH 07/10] regex: fix longstanding backref match bug Paul Eggert
` (3 subsequent siblings)
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:25 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon
closure first rather than last. Otherwise, the epsilon closure
might contain a duplicate of NODE.
---
ChangeLog | 5 +++++
lib/regcomp.c | 8 +++-----
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index f4583e0af..74304474b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: avoid duplicate in espilon closure
+ * lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon
+ closure first rather than last. Otherwise, the epsilon closure
+ might contain a duplicate of NODE.
+
regex-tests: fix typo
* tests/test-regex.c (main): Fix typo that would have caused an
old test case to report incorrect values on failure.
diff --git a/lib/regcomp.c b/lib/regcomp.c
index d93698ae7..887e5b506 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -1695,12 +1695,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
reg_errcode_t err;
Idx i;
re_node_set eclosure;
- bool ok;
bool incomplete = false;
err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
if (__glibc_unlikely (err != REG_NOERROR))
return err;
+ /* An epsilon closure includes itself. */
+ eclosure.elems[eclosure.nelem++] = node;
+
/* This indicates that we are calculating this node now.
We reference this value to avoid infinite loop. */
dfa->eclosures[node].nelem = -1;
@@ -1753,10 +1755,6 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
}
}
- /* An epsilon closure includes itself. */
- ok = re_node_set_insert (&eclosure, node);
- if (__glibc_unlikely (! ok))
- return REG_ESPACE;
if (incomplete && !root)
dfa->eclosures[node].nelem = 0;
else
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 07/10] regex: fix longstanding backref match bug
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
` (4 preceding siblings ...)
2021-02-06 1:25 ` [PATCH 06/10] regex: avoid duplicate in espilon closure Paul Eggert
@ 2021-02-06 1:25 ` Paul Eggert
2021-02-09 19:42 ` Adhemerval Zanella
2021-04-16 21:37 ` Dmitry V. Levin
2021-02-06 1:26 ` [PATCH 08/10] regex: debug check for set member duplicates Paul Eggert
` (2 subsequent siblings)
8 siblings, 2 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:25 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
This fixes a longstanding glibc bug concerning backreferences
<https://sourceware.org/11053> (2009-12-04).
* lib/regexec.c (proceed_next_node, push_fail_stack)
(pop_fail_stack): Push and pop the previous registers
as well as the current ones. All callers changed.
(set_regs): Also pop if CUR_NODE has already been checked,
so that it does not get added as a duplicate set entry.
(update_regs): Fix comment location.
* tests/test-regex.c (tests): New constant.
(bug_regex11): New test function.
(main): Bump alarm value. Call new test function.
---
ChangeLog | 13 ++++++
lib/regexec.c | 26 +++++++----
tests/test-regex.c | 113 ++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 141 insertions(+), 11 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 74304474b..bd7d1fa16 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: fix longstanding backref match bug
+ This fixes a longstanding glibc bug concerning backreferences
+ <https://sourceware.org/11053> (2009-12-04).
+ * lib/regexec.c (proceed_next_node, push_fail_stack)
+ (pop_fail_stack): Push and pop the previous registers
+ as well as the current ones. All callers changed.
+ (set_regs): Also pop if CUR_NODE has already been checked,
+ so that it does not get added as a duplicate set entry.
+ (update_regs): Fix comment location.
+ * tests/test-regex.c (tests): New constant.
+ (bug_regex11): New test function.
+ (main): Bump alarm value. Call new test function.
+
regex: avoid duplicate in espilon closure
* lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon
closure first rather than last. Otherwise, the epsilon closure
diff --git a/lib/regexec.c b/lib/regexec.c
index fdd2e373e..424bc8d15 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -59,7 +59,7 @@ static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
Idx cur_idx, Idx nmatch);
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
Idx str_idx, Idx dest_node, Idx nregs,
- regmatch_t *regs,
+ regmatch_t *regs, regmatch_t *prevregs,
re_node_set *eps_via_nodes);
static reg_errcode_t set_regs (const regex_t *preg,
const re_match_context_t *mctx,
@@ -1211,6 +1211,7 @@ check_halt_state_context (const re_match_context_t *mctx,
static Idx
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
+ regmatch_t *prevregs,
Idx *pidx, Idx node, re_node_set *eps_via_nodes,
struct re_fail_stack_t *fs)
{
@@ -1243,7 +1244,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
/* Otherwise, push the second epsilon-transition on the fail stack. */
else if (fs != NULL
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
- eps_via_nodes))
+ prevregs, eps_via_nodes))
return -2;
/* We know we are going to exit. */
@@ -1316,7 +1317,8 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
static reg_errcode_t
__attribute_warn_unused_result__
push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
- Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
+ Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
+ re_node_set *eps_via_nodes)
{
reg_errcode_t err;
Idx num = fs->num++;
@@ -1332,23 +1334,26 @@ push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
}
fs->stack[num].idx = str_idx;
fs->stack[num].node = dest_node;
- fs->stack[num].regs = re_malloc (regmatch_t, nregs);
+ fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
if (fs->stack[num].regs == NULL)
return REG_ESPACE;
memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
+ memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
return err;
}
static Idx
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
- regmatch_t *regs, re_node_set *eps_via_nodes)
+ regmatch_t *regs, regmatch_t *prevregs,
+ re_node_set *eps_via_nodes)
{
if (fs == NULL || fs->num == 0)
return -1;
Idx num = --fs->num;
*pidx = fs->stack[num].idx;
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
+ memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
re_node_set_free (eps_via_nodes);
re_free (fs->stack[num].regs);
*eps_via_nodes = fs->stack[num].eps_via_nodes;
@@ -1408,7 +1413,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
{
update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
- if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
+ if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
+ || re_node_set_contains (&eps_via_nodes, cur_node))
{
Idx reg_idx;
cur_node = -1;
@@ -1418,7 +1424,7 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
{
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
- &eps_via_nodes);
+ prev_idx_match, &eps_via_nodes);
break;
}
}
@@ -1431,7 +1437,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
}
/* Proceed to next node. */
- cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
+ cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
+ &idx, cur_node,
&eps_via_nodes, fs);
if (__glibc_unlikely (cur_node < 0))
@@ -1443,7 +1450,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
free_fail_stack_return (fs);
return REG_ESPACE;
}
- cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, &eps_via_nodes);
+ cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
+ prev_idx_match, &eps_via_nodes);
if (cur_node < 0)
{
re_node_set_free (&eps_via_nodes);
diff --git a/tests/test-regex.c b/tests/test-regex.c
index 58f8200f2..a14619805 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -55,6 +55,111 @@ really_utf8 (void)
return strcmp (locale_charset (), "UTF-8") == 0;
}
+/* Tests supposed to match; copied from glibc posix/bug-regex11.c. */
+static struct
+{
+ const char *pattern;
+ const char *string;
+ int flags, nmatch;
+ regmatch_t rm[5];
+} const tests[] = {
+ /* Test for newline handling in regex. */
+ { "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } },
+ /* Other tests. */
+ { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } },
+ { ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
+ { { 0, 21 }, { 15, 16 }, { 16, 18 } } },
+ { ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
+ { { 0, 21 }, { 8, 9 }, { 9, 10 } } },
+ { "^\\(a*\\)\\1\\{9\\}\\(a\\{0,9\\}\\)\\([0-9]*;.*[^a]\\2\\([0-9]\\)\\)",
+ "a1;;0a1aa2aaa3aaaa4aaaaa5aaaaaa6aaaaaaa7aaaaaaaa8aaaaaaaaa9aa2aa1a0", 0,
+ 5, { { 0, 67 }, { 0, 0 }, { 0, 1 }, { 1, 67 }, { 66, 67 } } },
+ /* Test for BRE expression anchoring. POSIX says just that this may match;
+ in glibc regex it always matched, so avoid changing it. */
+ { "\\(^\\|foo\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
+ { "\\(foo\\|^\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
+ /* In ERE this must be treated as an anchor. */
+ { "(^|foo)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
+ { "(foo|^)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
+ /* Here ^ cannot be treated as an anchor according to POSIX. */
+ { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+ { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+ /* More tests on backreferences. */
+ { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+ { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } },
+ { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+ { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
+ { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
+ { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } },
+ { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
+ { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } },
+ { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
+ { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
+ { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } },
+ { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
+ { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 } } },
+ { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } } },
+ { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } },
+ { "^([^,]*),\\1,\\1$", "a,a,a", REG_EXTENDED, 2, { { 0, 5 }, { 0, 1 } } },
+ { "^([^,]*),\\1,\\1$", "ab,ab,ab", REG_EXTENDED, 2, { { 0, 8 }, { 0, 2 } } },
+ { "^([^,]*),\\1,\\1,\\1$", "abc,abc,abc,abc", REG_EXTENDED, 2,
+ { { 0, 15 }, { 0, 3 } } },
+ { "^(.?)(.?)(.?)(.?)(.?).?\\5\\4\\3\\2\\1$",
+ "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+ "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+ "abcdedcba", REG_EXTENDED, 1, { { 0, 9 } } },
+ /* XXX Not used since they fail so far. */
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+ "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
+ "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
+ "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
+};
+
+static void
+bug_regex11 (void)
+{
+ regex_t re;
+ regmatch_t rm[5];
+ size_t i;
+ int n;
+
+ for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
+ {
+ n = regcomp (&re, tests[i].pattern, tests[i].flags);
+ if (n != 0)
+ {
+ char buf[500];
+ regerror (n, &re, buf, sizeof (buf));
+ report_error ("%s: regcomp %zd failed: %s", tests[i].pattern, i, buf);
+ continue;
+ }
+
+ if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0))
+ {
+ report_error ("%s: regexec %zd failed", tests[i].pattern, i);
+ regfree (&re);
+ continue;
+ }
+
+ for (n = 0; n < tests[i].nmatch; ++n)
+ if (rm[n].rm_so != tests[i].rm[n].rm_so
+ || rm[n].rm_eo != tests[i].rm[n].rm_eo)
+ {
+ if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1)
+ break;
+ report_error ("%s: regexec %zd match failure rm[%d] %d..%d",
+ tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo);
+ break;
+ }
+
+ regfree (&re);
+ }
+}
+
int
main (void)
{
@@ -65,11 +170,15 @@ main (void)
struct re_registers regs;
#if HAVE_DECL_ALARM
- /* Some builds of glibc go into an infinite loop on this test. */
- int alarm_value = 2;
+ /* In case a bug causes glibc to go into an infinite loop.
+ The tests should take less than 10 s on a reasonably modern CPU. */
+ int alarm_value = 1000;
signal (SIGALRM, SIG_DFL);
alarm (alarm_value);
#endif
+
+ bug_regex11 ();
+
if (setlocale (LC_ALL, "en_US.UTF-8"))
{
{
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 08/10] regex: debug check for set member duplicates
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
` (5 preceding siblings ...)
2021-02-06 1:25 ` [PATCH 07/10] regex: fix longstanding backref match bug Paul Eggert
@ 2021-02-06 1:26 ` Paul Eggert
2021-02-06 1:26 ` [PATCH 09/10] regex-tests: add bug 11053 test Paul Eggert
2021-02-06 1:26 ` [PATCH 10/10] regex: fix comment location Paul Eggert
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:26 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* lib/regex_internal.c (re_node_set_insert): Add a DEBUG_ASSERT
that would have caught some recently-fixed performance bugs
that caused sets to contain duplicate members.
---
ChangeLog | 5 +++++
lib/regex_internal.c | 1 +
2 files changed, 6 insertions(+)
diff --git a/ChangeLog b/ChangeLog
index bd7d1fa16..69abc35dc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: debug check for set member duplicates
+ * lib/regex_internal.c (re_node_set_insert): Add a DEBUG_ASSERT
+ that would have caught some recently-fixed performance bugs
+ that caused sets to contain duplicate members.
+
regex: fix longstanding backref match bug
This fixes a longstanding glibc bug concerning backreferences
<https://sourceware.org/11053> (2009-12-04).
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index 9dd387ef8..55f6b66de 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -1314,6 +1314,7 @@ re_node_set_insert (re_node_set *set, Idx elem)
{
for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
set->elems[idx] = set->elems[idx - 1];
+ DEBUG_ASSERT (set->elems[idx - 1] < elem);
}
/* Insert the new element. */
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 09/10] regex-tests: add bug 11053 test
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
` (6 preceding siblings ...)
2021-02-06 1:26 ` [PATCH 08/10] regex: debug check for set member duplicates Paul Eggert
@ 2021-02-06 1:26 ` Paul Eggert
2021-02-06 1:26 ` [PATCH 10/10] regex: fix comment location Paul Eggert
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:26 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* tests/test-regex.c (main): New test case for glibc bug 11053.
---
ChangeLog | 3 +++
tests/test-regex.c | 29 +++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/ChangeLog b/ChangeLog
index 69abc35dc..d838ad0d0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex-tests: add bug 11053 test
+ * tests/test-regex.c (main): New test case for glibc bug 11053.
+
regex: debug check for set member duplicates
* lib/regex_internal.c (re_node_set_insert): Add a DEBUG_ASSERT
that would have caught some recently-fixed performance bugs
diff --git a/tests/test-regex.c b/tests/test-regex.c
index a14619805..adccf2187 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -407,6 +407,35 @@ main (void)
free (regs.end);
}
+ /* glibc bug 11053. */
+ re_set_syntax (RE_SYNTAX_POSIX_BASIC);
+ memset (®ex, 0, sizeof regex);
+ static char const pat_sub2[] = "\\(a*\\)*a*\\1";
+ s = re_compile_pattern (pat_sub2, sizeof pat_sub2 - 1, ®ex);
+ if (s)
+ report_error ("%s: %s", pat_sub2, s);
+ else
+ {
+ memset (®s, 0, sizeof regs);
+ static char const data[] = "a";
+ int datalen = sizeof data - 1;
+ i = re_search (®ex, data, datalen, 0, datalen, ®s);
+ if (i != 0)
+ report_error ("re_search '%s' on '%s' returned %d", pat_sub2, data, i);
+ else if (regs.num_regs < 2)
+ report_error ("re_search '%s' on '%s' returned only %d registers",
+ pat_sub2, data, (int) regs.num_regs);
+ else if (! (regs.start[0] == 0 && regs.end[0] == 1))
+ report_error ("re_search '%s' on '%s' returned wrong match [%d,%d)",
+ pat_sub2, data, (int) regs.start[0], (int) regs.end[0]);
+ else if (! (regs.start[1] == 0 && regs.end[1] == 0))
+ report_error ("re_search '%s' on '%s' returned wrong submatch [%d,%d)",
+ pat_sub2, data, regs.start[1], regs.end[1]);
+ regfree (®ex);
+ free (regs.start);
+ free (regs.end);
+ }
+
/* Catch a bug reported by Vin Shelton in
https://lists.gnu.org/r/bug-coreutils/2007-06/msg00089.html
*/
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 10/10] regex: fix comment location
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
` (7 preceding siblings ...)
2021-02-06 1:26 ` [PATCH 09/10] regex-tests: add bug 11053 test Paul Eggert
@ 2021-02-06 1:26 ` Paul Eggert
8 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-06 1:26 UTC (permalink / raw)
To: bug-gnulib; +Cc: Paul Eggert
* lib/regexec.c (update_regs): Move comment.
---
ChangeLog | 3 +++
lib/regexec.c | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/ChangeLog b/ChangeLog
index d838ad0d0..70e357884 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: fix comment location
+ * lib/regexec.c (update_regs): Move comment.
+
regex-tests: add bug 11053 test
* tests/test-regex.c (main): New test case for glibc bug 11053.
diff --git a/lib/regexec.c b/lib/regexec.c
index 424bc8d15..6309deac8 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -1500,10 +1500,10 @@ update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
}
else if (type == OP_CLOSE_SUBEXP)
{
+ /* We are at the last node of this sub expression. */
Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
if (reg_num < nmatch)
{
- /* We are at the last node of this sub expression. */
if (pmatch[reg_num].rm_so < cur_idx)
{
pmatch[reg_num].rm_eo = cur_idx;
--
2.27.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 07/10] regex: fix longstanding backref match bug
2021-02-06 1:25 ` [PATCH 07/10] regex: fix longstanding backref match bug Paul Eggert
@ 2021-02-09 19:42 ` Adhemerval Zanella
2021-02-09 22:22 ` Paul Eggert
2021-04-16 21:37 ` Dmitry V. Levin
1 sibling, 1 reply; 13+ messages in thread
From: Adhemerval Zanella @ 2021-02-09 19:42 UTC (permalink / raw)
To: Paul Eggert, bug-gnulib
Hi Paul,
Trying to sync gnulib with glibc code, this patch trigger some regression
on glibc testcases:
FAIL: posix/tst-boost
FAIL: posix/tst-pcre
FAIL: posix/tst-rxspencer
FAIL: posix/tst-rxspencer-no-utf8
$ grep -n "FAIL rm" posix/tst-boost.out
445:FAIL rm[1] 3..-1 != expected 2..3
448:FAIL rm[1] 3..-1 != expected 2..3
451:FAIL rm[1] 3..-1 != expected 2..3
454:FAIL rm[1] 3..-1 != expected 2..3
$ cat posix/tst-pcre.out
1168: /([a]*)*/ on a unexpectedly failed to match register 1 1..-1
1171: /([a]*)*/ on aaaaa unexpectedly failed to match register 1 5..-1
1176: /([ab]*)*/ on a unexpectedly failed to match register 1 1..-1
1179: /([ab]*)*/ on b unexpectedly failed to match register 1 1..-1
1182: /([ab]*)*/ on ababab unexpectedly failed to match register 1 6..-1
1185: /([ab]*)*/ on aaaabcde unexpectedly failed to match register 1 5..-1
1188: /([ab]*)*/ on bbbb unexpectedly failed to match register 1 4..-1
1193: /([^a]*)*/ on b unexpectedly failed to match register 1 1..-1
1196: /([^a]*)*/ on bbbb unexpectedly failed to match register 1 4..-1
1203: /([^ab]*)*/ on cccc unexpectedly failed to match register 1 4..-1
$ cat posix/tst-rxspencer.out | grep FAIL
FAIL rm[1] unexpectedly did not match
FAIL rm[1] unexpectedly did not match
$ cat posix/tst-rxspencer-no-utf8.out | grep FAIL
FAIL rm[1] unexpectedly did not match
FAIL rm[1] unexpectedly did not match
On 05/02/2021 22:25, Paul Eggert wrote:
> This fixes a longstanding glibc bug concerning backreferences
> <https://sourceware.org/11053> (2009-12-04).
> * lib/regexec.c (proceed_next_node, push_fail_stack)
> (pop_fail_stack): Push and pop the previous registers
> as well as the current ones. All callers changed.
> (set_regs): Also pop if CUR_NODE has already been checked,
> so that it does not get added as a duplicate set entry.
> (update_regs): Fix comment location.
> * tests/test-regex.c (tests): New constant.
> (bug_regex11): New test function.
> (main): Bump alarm value. Call new test function.
> ---
> ChangeLog | 13 ++++++
> lib/regexec.c | 26 +++++++----
> tests/test-regex.c | 113 ++++++++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 141 insertions(+), 11 deletions(-)
>
> diff --git a/ChangeLog b/ChangeLog
> index 74304474b..bd7d1fa16 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,5 +1,18 @@
> 2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
>
> + regex: fix longstanding backref match bug
> + This fixes a longstanding glibc bug concerning backreferences
> + <https://sourceware.org/11053> (2009-12-04).
> + * lib/regexec.c (proceed_next_node, push_fail_stack)
> + (pop_fail_stack): Push and pop the previous registers
> + as well as the current ones. All callers changed.
> + (set_regs): Also pop if CUR_NODE has already been checked,
> + so that it does not get added as a duplicate set entry.
> + (update_regs): Fix comment location.
> + * tests/test-regex.c (tests): New constant.
> + (bug_regex11): New test function.
> + (main): Bump alarm value. Call new test function.
> +
> regex: avoid duplicate in espilon closure
> * lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon
> closure first rather than last. Otherwise, the epsilon closure
> diff --git a/lib/regexec.c b/lib/regexec.c
> index fdd2e373e..424bc8d15 100644
> --- a/lib/regexec.c
> +++ b/lib/regexec.c
> @@ -59,7 +59,7 @@ static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
> Idx cur_idx, Idx nmatch);
> static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
> Idx str_idx, Idx dest_node, Idx nregs,
> - regmatch_t *regs,
> + regmatch_t *regs, regmatch_t *prevregs,
> re_node_set *eps_via_nodes);
> static reg_errcode_t set_regs (const regex_t *preg,
> const re_match_context_t *mctx,
> @@ -1211,6 +1211,7 @@ check_halt_state_context (const re_match_context_t *mctx,
>
> static Idx
> proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
> + regmatch_t *prevregs,
> Idx *pidx, Idx node, re_node_set *eps_via_nodes,
> struct re_fail_stack_t *fs)
> {
> @@ -1243,7 +1244,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
> /* Otherwise, push the second epsilon-transition on the fail stack. */
> else if (fs != NULL
> && push_fail_stack (fs, *pidx, candidate, nregs, regs,
> - eps_via_nodes))
> + prevregs, eps_via_nodes))
> return -2;
>
> /* We know we are going to exit. */
> @@ -1316,7 +1317,8 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
> static reg_errcode_t
> __attribute_warn_unused_result__
> push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
> - Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
> + Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
> + re_node_set *eps_via_nodes)
> {
> reg_errcode_t err;
> Idx num = fs->num++;
> @@ -1332,23 +1334,26 @@ push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
> }
> fs->stack[num].idx = str_idx;
> fs->stack[num].node = dest_node;
> - fs->stack[num].regs = re_malloc (regmatch_t, nregs);
> + fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
> if (fs->stack[num].regs == NULL)
> return REG_ESPACE;
> memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
> + memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
> err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
> return err;
> }
>
> static Idx
> pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
> - regmatch_t *regs, re_node_set *eps_via_nodes)
> + regmatch_t *regs, regmatch_t *prevregs,
> + re_node_set *eps_via_nodes)
> {
> if (fs == NULL || fs->num == 0)
> return -1;
> Idx num = --fs->num;
> *pidx = fs->stack[num].idx;
> memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
> + memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
> re_node_set_free (eps_via_nodes);
> re_free (fs->stack[num].regs);
> *eps_via_nodes = fs->stack[num].eps_via_nodes;
> @@ -1408,7 +1413,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
> {
> update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
>
> - if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
> + if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
> + || re_node_set_contains (&eps_via_nodes, cur_node))
> {
> Idx reg_idx;
> cur_node = -1;
> @@ -1418,7 +1424,7 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
> if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
> {
> cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
> - &eps_via_nodes);
> + prev_idx_match, &eps_via_nodes);
> break;
> }
> }
> @@ -1431,7 +1437,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
> }
>
> /* Proceed to next node. */
> - cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
> + cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
> + &idx, cur_node,
> &eps_via_nodes, fs);
>
> if (__glibc_unlikely (cur_node < 0))
> @@ -1443,7 +1450,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
> free_fail_stack_return (fs);
> return REG_ESPACE;
> }
> - cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, &eps_via_nodes);
> + cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
> + prev_idx_match, &eps_via_nodes);
> if (cur_node < 0)
> {
> re_node_set_free (&eps_via_nodes);
> diff --git a/tests/test-regex.c b/tests/test-regex.c
> index 58f8200f2..a14619805 100644
> --- a/tests/test-regex.c
> +++ b/tests/test-regex.c
> @@ -55,6 +55,111 @@ really_utf8 (void)
> return strcmp (locale_charset (), "UTF-8") == 0;
> }
>
> +/* Tests supposed to match; copied from glibc posix/bug-regex11.c. */
> +static struct
> +{
> + const char *pattern;
> + const char *string;
> + int flags, nmatch;
> + regmatch_t rm[5];
> +} const tests[] = {
> + /* Test for newline handling in regex. */
> + { "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } },
> + /* Other tests. */
> + { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } },
> + { ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
> + { { 0, 21 }, { 15, 16 }, { 16, 18 } } },
> + { ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
> + { { 0, 21 }, { 8, 9 }, { 9, 10 } } },
> + { "^\\(a*\\)\\1\\{9\\}\\(a\\{0,9\\}\\)\\([0-9]*;.*[^a]\\2\\([0-9]\\)\\)",
> + "a1;;0a1aa2aaa3aaaa4aaaaa5aaaaaa6aaaaaaa7aaaaaaaa8aaaaaaaaa9aa2aa1a0", 0,
> + 5, { { 0, 67 }, { 0, 0 }, { 0, 1 }, { 1, 67 }, { 66, 67 } } },
> + /* Test for BRE expression anchoring. POSIX says just that this may match;
> + in glibc regex it always matched, so avoid changing it. */
> + { "\\(^\\|foo\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
> + { "\\(foo\\|^\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
> + /* In ERE this must be treated as an anchor. */
> + { "(^|foo)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
> + { "(foo|^)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
> + /* Here ^ cannot be treated as an anchor according to POSIX. */
> + { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
> + { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
> + /* More tests on backreferences. */
> + { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
> + { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } },
> + { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
> + { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
> + { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
> + { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } },
> + { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
> + { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } },
> + { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
> + { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
> + { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } },
> + { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
> + { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 } } },
> + { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } } },
> + { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } },
> + { "^([^,]*),\\1,\\1$", "a,a,a", REG_EXTENDED, 2, { { 0, 5 }, { 0, 1 } } },
> + { "^([^,]*),\\1,\\1$", "ab,ab,ab", REG_EXTENDED, 2, { { 0, 8 }, { 0, 2 } } },
> + { "^([^,]*),\\1,\\1,\\1$", "abc,abc,abc,abc", REG_EXTENDED, 2,
> + { { 0, 15 }, { 0, 3 } } },
> + { "^(.?)(.?)(.?)(.?)(.?).?\\5\\4\\3\\2\\1$",
> + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
> + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
> + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
> + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
> + "abcdedcba", REG_EXTENDED, 1, { { 0, 9 } } },
> + /* XXX Not used since they fail so far. */
> + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
> + "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
> + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
> + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
> + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
> + "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
> +};
> +
> +static void
> +bug_regex11 (void)
> +{
> + regex_t re;
> + regmatch_t rm[5];
> + size_t i;
> + int n;
> +
> + for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
> + {
> + n = regcomp (&re, tests[i].pattern, tests[i].flags);
> + if (n != 0)
> + {
> + char buf[500];
> + regerror (n, &re, buf, sizeof (buf));
> + report_error ("%s: regcomp %zd failed: %s", tests[i].pattern, i, buf);
> + continue;
> + }
> +
> + if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0))
> + {
> + report_error ("%s: regexec %zd failed", tests[i].pattern, i);
> + regfree (&re);
> + continue;
> + }
> +
> + for (n = 0; n < tests[i].nmatch; ++n)
> + if (rm[n].rm_so != tests[i].rm[n].rm_so
> + || rm[n].rm_eo != tests[i].rm[n].rm_eo)
> + {
> + if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1)
> + break;
> + report_error ("%s: regexec %zd match failure rm[%d] %d..%d",
> + tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo);
> + break;
> + }
> +
> + regfree (&re);
> + }
> +}
> +
> int
> main (void)
> {
> @@ -65,11 +170,15 @@ main (void)
> struct re_registers regs;
>
> #if HAVE_DECL_ALARM
> - /* Some builds of glibc go into an infinite loop on this test. */
> - int alarm_value = 2;
> + /* In case a bug causes glibc to go into an infinite loop.
> + The tests should take less than 10 s on a reasonably modern CPU. */
> + int alarm_value = 1000;
> signal (SIGALRM, SIG_DFL);
> alarm (alarm_value);
> #endif
> +
> + bug_regex11 ();
> +
> if (setlocale (LC_ALL, "en_US.UTF-8"))
> {
> {
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 07/10] regex: fix longstanding backref match bug
2021-02-09 19:42 ` Adhemerval Zanella
@ 2021-02-09 22:22 ` Paul Eggert
0 siblings, 0 replies; 13+ messages in thread
From: Paul Eggert @ 2021-02-09 22:22 UTC (permalink / raw)
To: Adhemerval Zanella, bug-gnulib
On 2/9/21 11:42 AM, Adhemerval Zanella wrote:
> Trying to sync gnulib with glibc code, this patch trigger some regression
> on glibc testcases:
Thanks for letting me know. That's odd, as I didn't find any regressions
when I synced. I do know of remaining bugs and will add these to my list.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 07/10] regex: fix longstanding backref match bug
2021-02-06 1:25 ` [PATCH 07/10] regex: fix longstanding backref match bug Paul Eggert
2021-02-09 19:42 ` Adhemerval Zanella
@ 2021-04-16 21:37 ` Dmitry V. Levin
1 sibling, 0 replies; 13+ messages in thread
From: Dmitry V. Levin @ 2021-04-16 21:37 UTC (permalink / raw)
To: Paul Eggert; +Cc: bug-gnulib
On Fri, Feb 05, 2021 at 05:25:59PM -0800, Paul Eggert wrote:
> This fixes a longstanding glibc bug concerning backreferences
> <https://sourceware.org/11053> (2009-12-04).
Nit: this short URL doesn't work, while the longer one does:
https://sourceware.org/bugzilla/show_bug.cgi?id=11053
> * lib/regexec.c (proceed_next_node, push_fail_stack)
> (pop_fail_stack): Push and pop the previous registers
> as well as the current ones. All callers changed.
> (set_regs): Also pop if CUR_NODE has already been checked,
> so that it does not get added as a duplicate set entry.
> (update_regs): Fix comment location.
> * tests/test-regex.c (tests): New constant.
> (bug_regex11): New test function.
> (main): Bump alarm value. Call new test function.
> ---
> ChangeLog | 13 ++++++
> lib/regexec.c | 26 +++++++----
> tests/test-regex.c | 113 ++++++++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 141 insertions(+), 11 deletions(-)
Apparently, this commit changes behaviour of GNU sed in a way that looks
like regression.
When GNU sed is built with this gnulib commit (v0.1-4426-g70b673eb7):
$ echo 123 |sed -r 's/^1*+(.)/\1/'
3
$ echo 123 |sed -r 's/^(1*)*(.)/\2/'
3
When GNU sed is built with the previous gnulib commit (v0.1-4425-gf5596242f):
$ echo 123 |sed -r 's/^1*+(.)/\1/'
23
$ echo 123 |sed -r 's/^(1*)*(.)/\2/'
23
There are regexps in the wild containing "*+" that used to work before,
see e.g.
https://lore.kernel.org/lkml/20210414182723.1670663-1-vt@altlinux.org/T/#u
--
ldv
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2021-04-16 22:15 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-06 1:25 [PATCH 01/10] regex: improve comments Paul Eggert
2021-02-06 1:25 ` [PATCH 02/10] regex: avoid undefined behavior Paul Eggert
2021-02-06 1:25 ` [PATCH 03/10] regex: minor refactoring Paul Eggert
2021-02-06 1:25 ` [PATCH 04/10] regex: make it easier to merge into glibc Paul Eggert
2021-02-06 1:25 ` [PATCH 05/10] regex-tests: fix typo Paul Eggert
2021-02-06 1:25 ` [PATCH 06/10] regex: avoid duplicate in espilon closure Paul Eggert
2021-02-06 1:25 ` [PATCH 07/10] regex: fix longstanding backref match bug Paul Eggert
2021-02-09 19:42 ` Adhemerval Zanella
2021-02-09 22:22 ` Paul Eggert
2021-04-16 21:37 ` Dmitry V. Levin
2021-02-06 1:26 ` [PATCH 08/10] regex: debug check for set member duplicates Paul Eggert
2021-02-06 1:26 ` [PATCH 09/10] regex-tests: add bug 11053 test Paul Eggert
2021-02-06 1:26 ` [PATCH 10/10] regex: fix comment location Paul Eggert
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).