bug-gnulib@gnu.org mirror (unofficial)
 help / color / mirror / Atom feed
From: Egor Ignatov <egori@altlinux.org>
To: ldv@altlinux.org
Cc: eggert@cs.ucla.edu, bug-gnulib@gnu.org
Subject: [PATCH v2] regex: fix backreference matching
Date: Fri,  9 Jul 2021 15:36:43 +0300	[thread overview]
Message-ID: <20210709123643.60443-1-egori@altlinux.org> (raw)
In-Reply-To: <20210705121201.GA20072@altlinux.org>

* lib/regexec.c
(proceed_next_node): Disable the dest_node check if we have backrefs.

(set_regs): Finish set_regs when we are at the last node and all
regs have been set.

(set_regs):
Also shrink the match if we are ready to finish but didn't accept the
entire string matched by check_matching, because check_matching may
return a wrong match for a regexp with back-references.  For example,
running check_matching with the regex '(a*)*(.)\1' on the string 'ab'
yields the match 'ab', whereas the correct match is just 'a', captured
by the second capturing group.

All built-in tests, as well as the tests from sed and grep, have passed.

Signed-off-by: Egor Ignatov <egori@altlinux.org>
---
 lib/regexec.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/lib/regexec.c b/lib/regexec.c
index 5e4eb497a..8f0f14575 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -1233,7 +1233,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
       for (Idx i = 0; i < edests->nelem; i++)
 	{
 	  Idx candidate = edests->elems[i];
-	  if (!re_node_set_contains (cur_nodes, candidate))
+	  if (!dfa->nbackref && !re_node_set_contains (cur_nodes, candidate))
 	    continue;
           if (dest_node == -1)
 	    dest_node = candidate;
@@ -1296,9 +1296,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
 	      if (__glibc_unlikely (! ok))
 		return -2;
 	      dest_node = dfa->edests[node].elems[0];
-	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
-					dest_node))
-		return dest_node;
+	      return dest_node;
 	    }
 	}
 
@@ -1308,8 +1306,9 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
 	  Idx dest_node = dfa->nexts[node];
 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
-		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
-					       dest_node)))
+		     || (!dfa->nbackref &&
+			 !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
+						dest_node))))
 	    return -1;
 	  re_node_set_empty (eps_via_nodes);
 	  return dest_node;
@@ -1417,8 +1416,7 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
     {
       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
 
-      if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
-	  || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
+      if (cur_node == mctx->last_node)
 	{
 	  Idx reg_idx;
 	  cur_node = -1;
@@ -1434,6 +1432,7 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
 	    }
 	  if (cur_node < 0)
 	    {
+	      pmatch[0].rm_eo = idx;
 	      re_node_set_free (&eps_via_nodes);
 	      regmatch_list_free (&prev_match);
 	      return free_fail_stack_return (fs);
-- 
2.29.3



  reply	other threads:[~2021-07-09 12:38 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-26  9:08 [PATCH] regex: fix match with possessive quantifier Egor Ignatov
2021-06-06 21:45 ` Dmitry V. Levin
2021-06-07  1:10   ` Dmitry V. Levin
2021-06-16  9:46     ` [PATCH] regex: fix backreference matching Egor Ignatov
2021-06-16 10:13       ` Dmitry V. Levin
2021-06-29  8:51         ` Egor Ignatov
2021-07-05 12:12           ` Dmitry V. Levin
2021-07-09 12:36             ` Egor Ignatov [this message]
2021-06-16 10:18     ` [PATCH] regex: fix match with possessive quantifier Dmitry V. Levin
2021-06-21 21:09   ` Paul Eggert
2021-06-22 15:35     ` Egor Ignatov
2021-06-22 15:35       ` [PATCH] regex: fix assertion in re_node_set_insert Egor Ignatov
2021-06-22 19:41         ` Paul Eggert

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://lists.gnu.org/mailman/listinfo/bug-gnulib

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210709123643.60443-1-egori@altlinux.org \
    --to=egori@altlinux.org \
    --cc=bug-gnulib@gnu.org \
    --cc=eggert@cs.ucla.edu \
    --cc=ldv@altlinux.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).