dobrokot/fast_grep_patch2.diff

## fast_grep_patch2.diff
--- dfa.c.back  2013-08-05 16:53:47.824514544 +0400
+++ dfa.c       2013-08-25 04:23:18.783885918 +0400
@@ -1992,6 +1992,115 @@
       s->elems[i] = s->elems[i + 1];
 }

+/* merge implementation of sorted sets, then number of sets smore than 2
+   priority queue aka 'heap' data structure used to select maximum of
+   many elements */
+
+typedef struct {
+  position p;
+  size_t set_index; /* remember which set this element was taken from */
+} heap_elem;
+
+
+static void
+heap_add (heap_elem *heap, size_t *heap_size, heap_elem e)
+{
+  size_t i = *heap_size;
+  while (i) {
+    size_t parent = (i-1) / 2;
+    if (e.p.index < heap[parent].p.index)
+      break;
+
+    heap[i] = heap[parent];
+    i = parent;
+  }
+  heap[i] = e;
+  *heap_size += 1;
+}
+
+static void
+heap_remove_top_and_add (heap_elem *heap, size_t heap_size, heap_elem e)
+{
+  size_t i = 0;
+  size_t lchild  = 2*i + 1;
+  size_t n = heap_size;
+  while (lchild < n) {
+    size_t rchild = lchild + 1;
+    size_t nextchild = lchild;
+    if (rchild < n && heap[lchild].p.index < heap[rchild].p.index)
+      nextchild = rchild;
+    if (e.p.index >= heap[nextchild].p.index)
+      break;
+    heap[i] = heap[nextchild];
+    i = nextchild;
+    lchild = 2*i + 1;
+  }
+  heap[i] = e;
+}
+
+
+static void
+heap_remove_top (heap_elem *heap, size_t *heap_size)
+{
+  *heap_size -= 1;
+  if (*heap_size == 0)
+    return;
+  heap_remove_top_and_add(heap, *heap_size, heap[*heap_size]);
+}
+
+
+static void
+merge_many_sets (position_set *sets, size_t n, position_set *result)
+{
+  //TODO: special cases for n == 1, and n == 2
+  //heap_elem *heap = (heap_elem*)malloc(n * sizeof(heap_elem));
+  size_t heap_size = 0;
+  heap_elem *heap = XNMALLOC(n, heap_elem);
+  size_t *ids = XNMALLOC(n, size_t);
+  size_t sum_of_sizes = 0;
+  size_t i;
+
+  for (i = 0; i < n; ++i) {
+    sum_of_sizes += sets[i].nelem;
+    /* special value for "end of set", to reduce
+       dependent memory reads */
+    ids[i] = ~(size_t)0;
+    if (sets[i].nelem) {
+      heap_elem e = { sets[i].elems[0], i };
+      heap_add(heap, &heap_size, e);
+      if (sets[i].nelem > 1)
+        ids[i] = 1;
+    }
+  }
+
+  position_set r;
+  REALLOC_IF_NECESSARY(r.elems, r.alloc, sum_of_sizes);
+  r.nelem = 0;
+
+  while (heap_size) {
+    heap_elem e = heap[0];
+    if (r.nelem && r.elems[r.nelem-1].index == e.p.index)
+      r.elems[r.nelem-1].constraint |= e.p.constraint;
+    else {
+      r.elems[r.nelem++] = e.p;
+    }
+    i = e.set_index;
+    if (ids[i] != ~(size_t)0) {
+      heap_elem e2 = { sets[i].elems[ids[i]], i };
+      heap_remove_top_and_add(heap, heap_size, e2);
+      ++ids[i];
+      if (ids[i] == sets[i].nelem)
+        ids[i] = ~(size_t)0;
+    }
+    else {
+      heap_remove_top(heap, &heap_size);
+    }
+  }
+  *result = r;
+  free(ids);
+  free(heap);
+}
+
 /* Find the index of the state corresponding to the given position set with
    the given preceding context, or create a new state if there is no such
    state.  Context tells whether we got here on a newline or letter. */
@@ -2482,7 +2591,7 @@
   charclass leftovers;          /* Stuff in the label that didn't match. */
   int leftoversf;               /* True if leftovers is nonempty. */
   position_set follows;         /* Union of the follows of some group. */
-  position_set tmp;             /* Temporary space for merging sets. */
+  position_set follows_tmp;             /* Temporary space for merging sets. */
   int possible_contexts;        /* Contexts that this group can match. */
   int separate_contexts;        /* Context that new state wants to know. */
   state_num state;              /* New state. */
@@ -2607,7 +2716,11 @@
     }

   alloc_position_set (&follows, d->nleaves);
-  alloc_position_set (&tmp, d->nleaves);
+  alloc_position_set (&follows_tmp, d->nleaves);
+
+  position_set *sets = 0;
+  size_t sets_n = 0, sets_alloc = 0;
+

   /* If we are a searching matcher, the default transition is to a state
      containing the positions of state 0, otherwise the default transition
@@ -2637,13 +2750,12 @@

   for (i = 0; i < ngrps; ++i)
     {
-      follows.nelem = 0;
-
-      /* Find the union of the follows of the positions of the group.
-         This is a hideously inefficient loop.  Fix it someday. */
+      /* Find the union of the follows of the positions of the group.  */
+      sets_n = grps[i].nelem;
+      REALLOC_IF_NECESSARY(sets, sets_alloc, sets_n);
       for (j = 0; j < grps[i].nelem; ++j)
-        for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
-          insert (d->follows[grps[i].elems[j]].elems[k], &follows);
+        sets[j] = d->follows[grps[i].elems[j]];
+      merge_many_sets(sets, sets_n, &follows_tmp);

       if (d->mb_cur_max > 1)
         {
@@ -2666,9 +2778,9 @@
              <mb A>, so we cannot add state[0].  */

           next_isnt_1st_byte = 0;
-          for (j = 0; j < follows.nelem; ++j)
+          for (j = 0; j < follows_tmp.nelem; ++j)
             {
-              if (!(d->multibyte_prop[follows.elems[j].index] & 1))
+              if (!(d->multibyte_prop[follows_tmp.elems[j].index] & 1))
                 {
                   next_isnt_1st_byte = 1;
                   break;
@@ -2679,11 +2791,8 @@
       /* If we are building a searching matcher, throw in the positions
          of state 0 as well. */
       if (d->searchflag
-          && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte))) {
-        for (j = 0; j < d->states[0].elems.nelem; ++j) {
-          insert(d->states[0].elems.elems[j], &follows);
-        }
-      }
+          && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
+        merge(&follows_tmp, &d->states[0].elems, &follows);

       /* Find out if the new state will want any context information. */
       possible_contexts = charclass_context (labels[i]);
@@ -2722,7 +2831,8 @@
   for (i = 0; i < ngrps; ++i)
     free (grps[i].elems);
   free (follows.elems);
-  free (tmp.elems);
+  free (sets);
+  free (follows_tmp.elems);
   free (grps);
   free (labels);
 }
	--- dfa.c.back 2013-08-05 16:53:47.824514544 +0400
	+++ dfa.c 2013-08-25 04:23:18.783885918 +0400
	@@ -1992,6 +1992,115 @@
	s->elems[i] = s->elems[i + 1];
	}

	+/* merge implementation of sorted sets, then number of sets smore than 2
	+ priority queue aka 'heap' data structure used to select maximum of
	+ many elements */
	+
	+typedef struct {
	+ position p;
	+ size_t set_index; /* remember which set this element was taken from */
	+} heap_elem;
	+
	+
	+static void
	+heap_add (heap_elem heap, size_t heap_size, heap_elem e)
	+{
	+ size_t i = *heap_size;
	+ while (i) {
	+ size_t parent = (i-1) / 2;
	+ if (e.p.index < heap[parent].p.index)
	+ break;
	+
	+ heap[i] = heap[parent];
	+ i = parent;
	+ }
	+ heap[i] = e;
	+ *heap_size += 1;
	+}
	+
	+static void
	+heap_remove_top_and_add (heap_elem *heap, size_t heap_size, heap_elem e)
	+{
	+ size_t i = 0;
	+ size_t lchild = 2*i + 1;
	+ size_t n = heap_size;
	+ while (lchild < n) {
	+ size_t rchild = lchild + 1;
	+ size_t nextchild = lchild;
	+ if (rchild < n && heap[lchild].p.index < heap[rchild].p.index)
	+ nextchild = rchild;
	+ if (e.p.index >= heap[nextchild].p.index)
	+ break;
	+ heap[i] = heap[nextchild];
	+ i = nextchild;
	+ lchild = 2*i + 1;
	+ }
	+ heap[i] = e;
	+}
	+
	+
	+static void
	+heap_remove_top (heap_elem heap, size_t heap_size)
	+{
	+ *heap_size -= 1;
	+ if (*heap_size == 0)
	+ return;
	+ heap_remove_top_and_add(heap, heap_size, heap[heap_size]);
	+}
	+
	+
	+static void
	+merge_many_sets (position_set sets, size_t n, position_set result)
	+{
	+ //TODO: special cases for n == 1, and n == 2
	+ //heap_elem heap = (heap_elem)malloc(n * sizeof(heap_elem));
	+ size_t heap_size = 0;
	+ heap_elem *heap = XNMALLOC(n, heap_elem);
	+ size_t *ids = XNMALLOC(n, size_t);
	+ size_t sum_of_sizes = 0;
	+ size_t i;
	+
	+ for (i = 0; i < n; ++i) {
	+ sum_of_sizes += sets[i].nelem;
	+ /* special value for "end of set", to reduce
	+ dependent memory reads */
	+ ids[i] = ~(size_t)0;
	+ if (sets[i].nelem) {
	+ heap_elem e = { sets[i].elems[0], i };
	+ heap_add(heap, &heap_size, e);
	+ if (sets[i].nelem > 1)
	+ ids[i] = 1;
	+ }
	+ }
	+
	+ position_set r;
	+ REALLOC_IF_NECESSARY(r.elems, r.alloc, sum_of_sizes);
	+ r.nelem = 0;
	+
	+ while (heap_size) {
	+ heap_elem e = heap[0];
	+ if (r.nelem && r.elems[r.nelem-1].index == e.p.index)
	+ r.elems[r.nelem-1].constraint \|= e.p.constraint;
	+ else {
	+ r.elems[r.nelem++] = e.p;
	+ }
	+ i = e.set_index;
	+ if (ids[i] != ~(size_t)0) {
	+ heap_elem e2 = { sets[i].elems[ids[i]], i };
	+ heap_remove_top_and_add(heap, heap_size, e2);
	+ ++ids[i];
	+ if (ids[i] == sets[i].nelem)
	+ ids[i] = ~(size_t)0;
	+ }
	+ else {
	+ heap_remove_top(heap, &heap_size);
	+ }
	+ }
	+ *result = r;
	+ free(ids);
	+ free(heap);
	+}
	+
	/* Find the index of the state corresponding to the given position set with
	the given preceding context, or create a new state if there is no such
	state. Context tells whether we got here on a newline or letter. */
	@@ -2482,7 +2591,7 @@
	charclass leftovers; /* Stuff in the label that didn't match. */
	int leftoversf; /* True if leftovers is nonempty. */
	position_set follows; /* Union of the follows of some group. */
	- position_set tmp; /* Temporary space for merging sets. */
	+ position_set follows_tmp; /* Temporary space for merging sets. */
	int possible_contexts; /* Contexts that this group can match. */
	int separate_contexts; /* Context that new state wants to know. */
	state_num state; /* New state. */
	@@ -2607,7 +2716,11 @@
	}

	alloc_position_set (&follows, d->nleaves);
	- alloc_position_set (&tmp, d->nleaves);
	+ alloc_position_set (&follows_tmp, d->nleaves);
	+
	+ position_set *sets = 0;
	+ size_t sets_n = 0, sets_alloc = 0;
	+

	/* If we are a searching matcher, the default transition is to a state
	containing the positions of state 0, otherwise the default transition
	@@ -2637,13 +2750,12 @@

	for (i = 0; i < ngrps; ++i)
	{
	- follows.nelem = 0;
	-
	- /* Find the union of the follows of the positions of the group.
	- This is a hideously inefficient loop. Fix it someday. */
	+ /* Find the union of the follows of the positions of the group. */
	+ sets_n = grps[i].nelem;
	+ REALLOC_IF_NECESSARY(sets, sets_alloc, sets_n);
	for (j = 0; j < grps[i].nelem; ++j)
	- for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
	- insert (d->follows[grps[i].elems[j]].elems[k], &follows);
	+ sets[j] = d->follows[grps[i].elems[j]];
	+ merge_many_sets(sets, sets_n, &follows_tmp);

	if (d->mb_cur_max > 1)
	{
	@@ -2666,9 +2778,9 @@
	<mb A>, so we cannot add state[0]. */

	next_isnt_1st_byte = 0;
	- for (j = 0; j < follows.nelem; ++j)
	+ for (j = 0; j < follows_tmp.nelem; ++j)
	{
	- if (!(d->multibyte_prop[follows.elems[j].index] & 1))
	+ if (!(d->multibyte_prop[follows_tmp.elems[j].index] & 1))
	{
	next_isnt_1st_byte = 1;
	break;
	@@ -2679,11 +2791,8 @@
	/* If we are building a searching matcher, throw in the positions
	of state 0 as well. */
	if (d->searchflag
	- && (!MBS_SUPPORT \|\| (d->mb_cur_max == 1 \|\| !next_isnt_1st_byte))) {
	- for (j = 0; j < d->states[0].elems.nelem; ++j) {
	- insert(d->states[0].elems.elems[j], &follows);
	- }
	- }
	+ && (!MBS_SUPPORT \|\| (d->mb_cur_max == 1 \|\| !next_isnt_1st_byte)))
	+ merge(&follows_tmp, &d->states[0].elems, &follows);

	/* Find out if the new state will want any context information. */
	possible_contexts = charclass_context (labels[i]);
	@@ -2722,7 +2831,8 @@
	for (i = 0; i < ngrps; ++i)
	free (grps[i].elems);
	free (follows.elems);
	- free (tmp.elems);
	+ free (sets);
	+ free (follows_tmp.elems);
	free (grps);
	free (labels);
	}