bug-gnulib@gnu.org mirror (unofficial)
 help / color / mirror / Atom feed
From: Paul Eggert <eggert@cs.ucla.edu>
To: bug-gnulib@gnu.org
Cc: Paul Eggert <eggert@cs.ucla.edu>
Subject: [PATCH 2/2] dfa: epsilon-closure tweaks (Bug#40634)
Date: Sat, 12 Sep 2020 18:54:57 -0700	[thread overview]
Message-ID: <20200913015457.21825-2-eggert@cs.ucla.edu> (raw)
In-Reply-To: <20200913015457.21825-1-eggert@cs.ucla.edu>

Rename BACKWORD to BACKWARD consistently.
* lib/dfa.c (struct dfa): Reorder members to reduce fragmentation.
(addtok_mb): Redo slightly to make it act more like a state machine.
Check depth only when it increases.
(epsclosure): Let the switch test the tokens.
(dfaanalyze): Cache tindex.  Simplify position loops.
Prefer xcalloc to xnmalloc + explicit zeroing.  Free BACKWARD
only if it is not null, since we're testing that anyway.
(dfaanalyze, build_state): Use merge2 instead of doing it by hand.
---
 ChangeLog |  27 ++++++++++++
 lib/dfa.c | 124 ++++++++++++++++++++++++++----------------------------
 2 files changed, 87 insertions(+), 64 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index bf39cc512..66aec61cb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,30 @@
+2020-09-12  Paul Eggert  <eggert@cs.ucla.edu>
+
+	dfa: epsilon-closure tweaks (Bug#40634)
+	Rename BACKWORD to BACKWARD consistently.
+	* lib/dfa.c (struct dfa): Reorder members to reduce fragmentation.
+	(addtok_mb): Redo slightly to make it act more like a state machine.
+	Check depth only when it increases.
+	(epsclosure): Let the switch test the tokens.
+	(dfaanalyze): Cache tindex.  Simplify position loops.
+	Prefer xcalloc to xnmalloc + explicit zeroing.  Free BACKWARD
+	only if it is not null, since we're testing that anyway.
+	(dfaanalyze, build_state): Use merge2 instead of doing it by hand.
+
+2020-09-12  Norihiro Tanaka  <noritnk@kcn.ne.jp>
+
+	dfa: use backward set in removal of epsilon closure
+	When removing in epsilon closure, the code searched all nodes
+	sequentially, and this was slow for some cases.  Build a backward
+	set before search, and only check previous position with the set.
+	Problem reported in <https://bugs.gnu.org/40634>.
+	* lib/dfa.c (struct dfa): New member 'epsilon'.
+	(addtok_mb): Check whether a pattern has epsilon node or not.
+	(epsclosure): New arg BACKWORD; caller changed.  When removing
+	epsilon node and reconnecting, check only previous positions.
+	Treat BEG as if it were character.
+	(dfaanalyze): Build backward set.
+
 2020-09-10  Paul Eggert  <eggert@cs.ucla.edu>
 
 	canonicalize: fix pointer indexing bugs
diff --git a/lib/dfa.c b/lib/dfa.c
index 7c8eb6685..0f0764887 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -482,13 +482,14 @@ struct dfa
   idx_t depth;			/* Depth required of an evaluation stack
                                    used for depth-first traversal of the
                                    parse tree.  */
-  idx_t nleaves;		/* Number of leaves on the parse tree.  */
+  idx_t nleaves;		/* Number of non-EMPTY leaves
+                                   in the parse tree.  */
   idx_t nregexps;		/* Count of parallel regexps being built
                                    with dfaparse.  */
   bool fast;			/* The DFA is fast.  */
+  bool epsilon;			/* Does a token match only the empty string?  */
   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;		/* Multibyte conversion state.  */
-  bool epsilon;
 
   /* The following are valid only if MB_CUR_MAX > 1.  */
 
@@ -1616,26 +1617,31 @@ addtok_mb (struct dfa *dfa, token t, char mbprop)
       dfa->parse.depth--;
       break;
 
+    case EMPTY:
+      dfa->epsilon = true;
+      goto increment_depth;
+
+    case BACKREF:
+      dfa->fast = false;
+      goto increment_nleaves;
+
     case BEGLINE:
     case ENDLINE:
     case BEGWORD:
     case ENDWORD:
     case LIMWORD:
     case NOTLIMWORD:
-    case EMPTY:
       dfa->epsilon = true;
       FALLTHROUGH;
-
     default:
-      if (t == BACKREF)
-        dfa->fast = false;
-      if (t != EMPTY)
-        dfa->nleaves++;
+    increment_nleaves:
+      dfa->nleaves++;
+    increment_depth:
       dfa->parse.depth++;
+      if (dfa->depth < dfa->parse.depth)
+        dfa->depth = dfa->parse.depth;
       break;
     }
-  if (dfa->parse.depth > dfa->depth)
-    dfa->depth = dfa->parse.depth;
 }
 
 static void addtok_wc (struct dfa *dfa, wint_t wc);
@@ -2156,6 +2162,8 @@ merge (position_set const *s1, position_set const *s2, position_set *m)
   merge_constrained (s1, s2, -1, m);
 }
 
+/* Merge into DST all the elements of SRC, possibly destroying
+   the contents of the temporary M.  */
 static void
 merge2 (position_set *dst, position_set const *src, position_set *m)
 {
@@ -2300,25 +2308,26 @@ state_index (struct dfa *d, position_set const *s, int context)
   return i;
 }
 
-/* Find the epsilon closure of a set of positions.  If any position of the set
+/* Find the epsilon closure of D's set of positions.  If any position of the set
    contains a symbol that matches the empty string in some context, replace
    that position with the elements of its follow labeled with an appropriate
    constraint.  Repeat exhaustively until no funny positions are left.
-   S->elems must be large enough to hold the result.  */
+   S->elems must be large enough to hold the result.  BACKWARD is D's
+   backward set; use and update it too.  */
 static void
-epsclosure (struct dfa const *d, position_set *backword)
+epsclosure (struct dfa const *d, position_set *backward)
 {
   position_set tmp;
   alloc_position_set (&tmp, d->nleaves);
   for (idx_t i = 0; i < d->tindex; i++)
-    if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
-        && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
-        && d->tokens[i] != MBCSET && d->tokens[i] < CSET
-        && d->tokens[i] != BEG)
+    if (0 < d->follows[i].nelem)
       {
         unsigned int constraint;
         switch (d->tokens[i])
           {
+          default:
+            continue;
+
           case BEGLINE:
             constraint = BEGLINE_CONSTRAINT;
             break;
@@ -2340,17 +2349,15 @@ epsclosure (struct dfa const *d, position_set *backword)
           case EMPTY:
             constraint = NO_CONSTRAINT;
             break;
-          default:
-            abort ();
           }
 
         delete (i, &d->follows[i]);
 
-        for (idx_t j = 0; j < backword[i].nelem; j++)
-          replace (&d->follows[backword[i].elems[j].index], i, &d->follows[i],
+        for (idx_t j = 0; j < backward[i].nelem; j++)
+          replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
                    constraint, &tmp);
         for (idx_t j = 0; j < d->follows[i].nelem; j++)
-          replace (&backword[d->follows[i].elems[j].index], i, &backword[i],
+          replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
                    NO_CONSTRAINT, &tmp);
       }
   free (tmp.elems);
@@ -2658,7 +2665,6 @@ dfaanalyze (struct dfa *d, bool searchflag)
   /* Firstpos and lastpos elements.  */
   position *firstpos = posalloc;
   position *lastpos = firstpos + d->nleaves;
-  position_set *backword = NULL;
   position pos;
   position_set tmp;
 
@@ -2676,10 +2682,11 @@ dfaanalyze (struct dfa *d, bool searchflag)
   position_set merged;          /* Result of merging sets.  */
 
   addtok (d, CAT);
+  idx_t tindex = d->tindex;
 
 #ifdef DEBUG
   fprintf (stderr, "dfaanalyze:\n");
-  for (idx_t i = 0; i < d->tindex; i++)
+  for (idx_t i = 0; i < tindex; i++)
     {
       fprintf (stderr, " %td:", i);
       prtok (d->tokens[i]);
@@ -2689,12 +2696,11 @@ dfaanalyze (struct dfa *d, bool searchflag)
 
   d->searchflag = searchflag;
   alloc_position_set (&merged, d->nleaves);
-  d->follows = xcalloc (d->tindex, sizeof *d->follows);
-
-  if (d->epsilon)
-    backword = xcalloc (d->tindex, sizeof *backword);
+  d->follows = xcalloc (tindex, sizeof *d->follows);
+  position_set *backward
+    = d->epsilon ? xcalloc (tindex, sizeof *backward) : NULL;
 
-  for (idx_t i = 0; i < d->tindex; i++)
+  for (idx_t i = 0; i < tindex; i++)
     {
       switch (d->tokens[i])
         {
@@ -2714,12 +2720,8 @@ dfaanalyze (struct dfa *d, bool searchflag)
           {
             tmp.elems = firstpos - stk[-1].nfirstpos;
             tmp.nelem = stk[-1].nfirstpos;
-            position *p = lastpos - stk[-1].nlastpos;
-            for (idx_t j = 0; j < stk[-1].nlastpos; j++)
-              {
-                merge (&tmp, &d->follows[p[j].index], &merged);
-                copy (&merged, &d->follows[p[j].index]);
-              }
+            for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
+              merge2 (&d->follows[p->index], &tmp, &merged);
           }
           FALLTHROUGH;
         case QMARK:
@@ -2729,28 +2731,27 @@ dfaanalyze (struct dfa *d, bool searchflag)
           break;
 
         case CAT:
-          /* Every element in the firstpos of the second argument is in the
-             follow of every element in the lastpos of the first argument.  */
-          if (d->epsilon)
+          /* Every element in the lastpos of the first argument is in
+             the backward set of every element in the firstpos of the
+             second argument.  */
+          if (backward)
             {
               tmp.nelem = stk[-2].nlastpos;
               tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
-              position *p = firstpos - stk[-1].nfirstpos;
-              for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
-                {
-                  merge (&tmp, &backword[p[j].index], &merged);
-                  copy (&merged, &backword[p[j].index]);
-                }
+              for (position *p = firstpos - stk[-1].nfirstpos;
+                   p < firstpos; p++)
+                merge2 (&backward[p->index], &tmp, &merged);
             }
+
+          /* Every element in the firstpos of the second argument is in the
+             follow of every element in the lastpos of the first argument.  */
           {
             tmp.nelem = stk[-1].nfirstpos;
             tmp.elems = firstpos - stk[-1].nfirstpos;
-            position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
-            for (idx_t j = 0; j < stk[-2].nlastpos; j++)
-              {
-                merge (&tmp, &d->follows[p[j].index], &merged);
-                copy (&merged, &d->follows[p[j].index]);
-              }
+            for (position *plim = lastpos - stk[-1].nlastpos,
+                   *p = plim - stk[-2].nlastpos;
+                 p < plim; p++)
+              merge2 (&d->follows[p->index], &tmp, &merged);
           }
 
           /* The firstpos of a CAT node is the firstpos of the first argument,
@@ -2831,20 +2832,21 @@ dfaanalyze (struct dfa *d, bool searchflag)
 #endif
     }
 
-  if (d->epsilon)
+  if (backward)
     {
       /* For each follow set that is the follow set of a real position,
          replace it with its epsilon closure.  */
-      epsclosure (d, backword);
+      epsclosure (d, backward);
 
-      for (idx_t i = 0; i < d->tindex; i++)
-        free (backword[i].elems);
+      for (idx_t i = 0; i < tindex; i++)
+        free (backward[i].elems);
+      free (backward);
     }
 
   dfaoptimize (d);
 
 #ifdef DEBUG
-  for (idx_t i = 0; i < d->tindex; i++)
+  for (idx_t i = 0; i < tindex; i++)
     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
@@ -2868,12 +2870,10 @@ dfaanalyze (struct dfa *d, bool searchflag)
 
   append (pos, &tmp);
 
-  d->separates = xnmalloc (d->tindex, sizeof *d->separates);
+  d->separates = xcalloc (tindex, sizeof *d->separates);
 
-  for (idx_t i = 0; i < d->tindex; i++)
+  for (idx_t i = 0; i < tindex; i++)
     {
-      d->separates[i] = 0;
-
       if (prev_newline_dependent (d->constraints[i]))
         d->separates[i] |= CTX_NEWLINE;
       if (prev_letter_dependent (d->constraints[i]))
@@ -2901,7 +2901,6 @@ dfaanalyze (struct dfa *d, bool searchflag)
   d->min_trcount++;
   d->trcount = 0;
 
-  free (backword);
   free (posalloc);
   free (stkalloc);
   free (merged.elems);
@@ -3172,10 +3171,7 @@ build_state (state_num s, struct dfa *d, unsigned char uc)
                 mergeit &= d->multibyte_prop[group.elems[j].index];
             }
           if (mergeit)
-            {
-              merge (&d->states[0].elems, &group, &tmp);
-              copy (&tmp, &group);
-            }
+            merge2 (&group, &d->states[0].elems, &tmp);
         }
 
       /* Find out if the new state will want any context information,
-- 
2.17.1



      reply	other threads:[~2020-09-13  1:55 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-13  1:54 [PATCH 1/2] dfa: use backward set in removal of epsilon closure Paul Eggert
2020-09-13  1:54 ` Paul Eggert [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://lists.gnu.org/mailman/listinfo/bug-gnulib

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200913015457.21825-2-eggert@cs.ucla.edu \
    --to=eggert@cs.ucla.edu \
    --cc=bug-gnulib@gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).