MasterDuke17/p Secret

## gistfile1.txt
filler

## p
diff --git a/src/strings/ops.c b/src/strings/ops.c
index 72a30b9..9db71af 100644
--- a/src/strings/ops.c
+++ b/src/strings/ops.c
@@ -1,4 +1,11 @@
 #include "moar.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#define ALPHABET_LEN 256
+#define NOT_FOUND patlen
+
+#define max(a, b) (((a) < (b)) ? (b) : (a))

 #define MVM_DEBUG_STRANDS 0

@@ -196,6 +203,135 @@ MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString
     }
 }

+// delta1 table: delta1[c] contains the distance between the last
+// character of pat and the rightmost occurrence of c in pat.
+// If c does not occur in pat, then delta1[c] = patlen.
+// If c is at string[i] and c != pat[patlen-1], we can
+// safely shift i over by delta1[c], which is the minimum distance
+// needed to shift pat forward to get string[i] lined up
+// with some character in pat.
+// this algorithm runs in alphabet_len+patlen time.
+void make_delta1(int *delta1, uint8_t *pat, int32_t patlen) {
+    int i;
+    for (i=0; i < ALPHABET_LEN; i++) {
+        delta1[i] = NOT_FOUND;
+    }
+    for (i=0; i < patlen-1; i++) {
+        delta1[pat[i]] = patlen-1 - i;
+    }
+}
+
+// true if the suffix of word starting from word[pos] is a prefix
+// of word
+int is_prefix(uint8_t *word, int wordlen, int pos) {
+    int i;
+    int suffixlen = wordlen - pos;
+    // could also use the strncmp() library function here
+    for (i = 0; i < suffixlen; i++) {
+        if (word[i] != word[pos+i]) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+// length of the longest suffix of word ending on word[pos].
+// suffix_length("dddbcabc", 8, 4) = 2
+int suffix_length(uint8_t *word, int wordlen, int pos) {
+    int i;
+    // increment suffix length i to the first mismatch or beginning
+    // of the word
+    for (i = 0; (word[pos-i] == word[wordlen-1-i]) && (i < pos); i++);
+    return i;
+}
+
+// delta2 table: given a mismatch at pat[pos], we want to align
+// with the next possible full match could be based on what we
+// know about pat[pos+1] to pat[patlen-1].
+//
+// In case 1:
+// pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat,
+// the next plausible match starts at or after the mismatch.
+// If, within the substring pat[pos+1 .. patlen-1], lies a prefix
+// of pat, the next plausible match is here (if there are multiple
+// prefixes in the substring, pick the longest). Otherwise, the
+// next plausible match starts past the character aligned with
+// pat[patlen-1].
+//
+// In case 2:
+// pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The
+// mismatch tells us that we are not looking at the end of a match.
+// We may, however, be looking at the middle of a match.
+//
+// The first loop, which takes care of case 1, is analogous to
+// the KMP table, adapted for a 'backwards' scan order with the
+// additional restriction that the substrings it considers as
+// potential prefixes are all suffixes. In the worst case scenario
+// pat consists of the same letter repeated, so every suffix is
+// a prefix. This loop alone is not sufficient, however:
+// Suppose that pat is "ABYXCDBYX", and text is ".....ABYXCDEYX".
+// We will match X, Y, and find B != E. There is no prefix of pat
+// in the suffix "YX", so the first loop tells us to skip forward
+// by 9 characters.
+// Although superficially similar to the KMP table, the KMP table
+// relies on information about the beginning of the partial match
+// that the BM algorithm does not have.
+//
+// The second loop addresses case 2. Since suffix_length may not be
+// unique, we want to take the minimum value, which will tell us
+// how far away the closest potential match is.
+void make_delta2(int *delta2, uint8_t *pat, int32_t patlen) {
+    int p;
+    int last_prefix_index = patlen-1;
+
+    // first loop
+    for (p=patlen-1; p>=0; p--) {
+        if (is_prefix(pat, patlen, p+1)) {
+            last_prefix_index = p+1;
+        }
+        delta2[p] = last_prefix_index + (patlen-1 - p);
+    }
+
+    // second loop
+    for (p=0; p < patlen-1; p++) {
+        int slen = suffix_length(pat, patlen, p);
+        if (pat[p - slen] != pat[patlen-1 - slen]) {
+            delta2[patlen-1 - slen] = patlen-1 - p + slen;
+        }
+    }
+}
+
+int boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen) {
+    int i;
+    int delta1[ALPHABET_LEN];
+    int *delta2 = (int *)malloc(patlen * sizeof(int));
+    make_delta1(delta1, pat, patlen);
+    make_delta2(delta2, pat, patlen);
+
+    // The empty pattern must be considered specially
+    if (patlen == 0) {
+        free(delta2);
+        return 0;
+    }
+
+    i = patlen-1;
+    while (i < stringlen) {
+        int j = patlen-1;
+        while (j >= 0 && (string[i] == pat[j])) {
+            --i;
+            --j;
+        }
+        if (j < 0) {
+            free(delta2);
+            return i+1;
+        }
+
+        i += max(delta1[string[i]], delta2[j]);
+    }
+    free(delta2);
+    return -1;
+}
+
 /* Returns the location of one string in another or -1  */
 MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) {
     MVMint64 result        = -1;
@@ -217,13 +353,25 @@ MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *haystack, MVMString *
     if (ngraphs > hgraphs || ngraphs < 1)
         return -1;

-    /* brute force for now. horrible, yes. halp. */
-    while (index <= hgraphs - ngraphs) {
-        if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
-            result = (MVMint64)index;
-            break;
+    if (haystack->body.storage_type == MVM_STRING_GRAPHEME_8 && needle->body.storage_type == MVM_STRING_GRAPHEME_8) {
+        if (start < hgraphs) {
+            result = (MVMint64)boyer_moore(haystack->body.storage.blob_8 + start, hgraphs - start, needle->body.storage.blob_8, ngraphs);
+            if (result > -1) result += start;
+            if (result > 500) printf("r: %i, s: %i\n", result, start);
+        }
+        else {
+            result = -1;
+        }
+    }
+    else {
+        /* brute force for now. horrible, yes. halp. */
+        while (index <= hgraphs - ngraphs) {
+            if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
+                result = (MVMint64)index;
+                break;
+            }
+            index++;
         }
-        index++;
     }
     return result;
 }
	diff --git a/src/strings/ops.c b/src/strings/ops.c
	index 72a30b9..9db71af 100644
	--- a/src/strings/ops.c
	+++ b/src/strings/ops.c
	@@ -1,4 +1,11 @@
	#include "moar.h"
	+#include <stdint.h>
	+#include <stdlib.h>
	+
	+#define ALPHABET_LEN 256
	+#define NOT_FOUND patlen
	+
	+#define max(a, b) (((a) < (b)) ? (b) : (a))

	#define MVM_DEBUG_STRANDS 0

	@@ -196,6 +203,135 @@ MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString
	}
	}

	+// delta1 table: delta1[c] contains the distance between the last
	+// character of pat and the rightmost occurrence of c in pat.
	+// If c does not occur in pat, then delta1[c] = patlen.
	+// If c is at string[i] and c != pat[patlen-1], we can
	+// safely shift i over by delta1[c], which is the minimum distance
	+// needed to shift pat forward to get string[i] lined up
	+// with some character in pat.
	+// this algorithm runs in alphabet_len+patlen time.
	+void make_delta1(int delta1, uint8_t pat, int32_t patlen) {
	+ int i;
	+ for (i=0; i < ALPHABET_LEN; i++) {
	+ delta1[i] = NOT_FOUND;
	+ }
	+ for (i=0; i < patlen-1; i++) {
	+ delta1[pat[i]] = patlen-1 - i;
	+ }
	+}
	+
	+// true if the suffix of word starting from word[pos] is a prefix
	+// of word
	+int is_prefix(uint8_t *word, int wordlen, int pos) {
	+ int i;
	+ int suffixlen = wordlen - pos;
	+ // could also use the strncmp() library function here
	+ for (i = 0; i < suffixlen; i++) {
	+ if (word[i] != word[pos+i]) {
	+ return 0;
	+ }
	+ }
	+ return 1;
	+}
	+
	+// length of the longest suffix of word ending on word[pos].
	+// suffix_length("dddbcabc", 8, 4) = 2
	+int suffix_length(uint8_t *word, int wordlen, int pos) {
	+ int i;
	+ // increment suffix length i to the first mismatch or beginning
	+ // of the word
	+ for (i = 0; (word[pos-i] == word[wordlen-1-i]) && (i < pos); i++);
	+ return i;
	+}
	+
	+// delta2 table: given a mismatch at pat[pos], we want to align
	+// with the next possible full match could be based on what we
	+// know about pat[pos+1] to pat[patlen-1].
	+//
	+// In case 1:
	+// pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat,
	+// the next plausible match starts at or after the mismatch.
	+// If, within the substring pat[pos+1 .. patlen-1], lies a prefix
	+// of pat, the next plausible match is here (if there are multiple
	+// prefixes in the substring, pick the longest). Otherwise, the
	+// next plausible match starts past the character aligned with
	+// pat[patlen-1].
	+//
	+// In case 2:
	+// pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The
	+// mismatch tells us that we are not looking at the end of a match.
	+// We may, however, be looking at the middle of a match.
	+//
	+// The first loop, which takes care of case 1, is analogous to
	+// the KMP table, adapted for a 'backwards' scan order with the
	+// additional restriction that the substrings it considers as
	+// potential prefixes are all suffixes. In the worst case scenario
	+// pat consists of the same letter repeated, so every suffix is
	+// a prefix. This loop alone is not sufficient, however:
	+// Suppose that pat is "ABYXCDBYX", and text is ".....ABYXCDEYX".
	+// We will match X, Y, and find B != E. There is no prefix of pat
	+// in the suffix "YX", so the first loop tells us to skip forward
	+// by 9 characters.
	+// Although superficially similar to the KMP table, the KMP table
	+// relies on information about the beginning of the partial match
	+// that the BM algorithm does not have.
	+//
	+// The second loop addresses case 2. Since suffix_length may not be
	+// unique, we want to take the minimum value, which will tell us
	+// how far away the closest potential match is.
	+void make_delta2(int delta2, uint8_t pat, int32_t patlen) {
	+ int p;
	+ int last_prefix_index = patlen-1;
	+
	+ // first loop
	+ for (p=patlen-1; p>=0; p--) {
	+ if (is_prefix(pat, patlen, p+1)) {
	+ last_prefix_index = p+1;
	+ }
	+ delta2[p] = last_prefix_index + (patlen-1 - p);
	+ }
	+
	+ // second loop
	+ for (p=0; p < patlen-1; p++) {
	+ int slen = suffix_length(pat, patlen, p);
	+ if (pat[p - slen] != pat[patlen-1 - slen]) {
	+ delta2[patlen-1 - slen] = patlen-1 - p + slen;
	+ }
	+ }
	+}
	+
	+int boyer_moore (uint8_t string, uint32_t stringlen, uint8_t pat, uint32_t patlen) {
	+ int i;
	+ int delta1[ALPHABET_LEN];
	+ int delta2 = (int )malloc(patlen * sizeof(int));
	+ make_delta1(delta1, pat, patlen);
	+ make_delta2(delta2, pat, patlen);
	+
	+ // The empty pattern must be considered specially
	+ if (patlen == 0) {
	+ free(delta2);
	+ return 0;
	+ }
	+
	+ i = patlen-1;
	+ while (i < stringlen) {
	+ int j = patlen-1;
	+ while (j >= 0 && (string[i] == pat[j])) {
	+ --i;
	+ --j;
	+ }
	+ if (j < 0) {
	+ free(delta2);
	+ return i+1;
	+ }
	+
	+ i += max(delta1[string[i]], delta2[j]);
	+ }
	+ free(delta2);
	+ return -1;
	+}
	+
	/* Returns the location of one string in another or -1 */
	MVMint64 MVM_string_index(MVMThreadContext tc, MVMString haystack, MVMString *needle, MVMint64 start) {
	MVMint64 result = -1;
	@@ -217,13 +353,25 @@ MVMint64 MVM_string_index(MVMThreadContext tc, MVMString haystack, MVMString *
	if (ngraphs > hgraphs \|\| ngraphs < 1)
	return -1;

	- /* brute force for now. horrible, yes. halp. */
	- while (index <= hgraphs - ngraphs) {
	- if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
	- result = (MVMint64)index;
	- break;
	+ if (haystack->body.storage_type == MVM_STRING_GRAPHEME_8 && needle->body.storage_type == MVM_STRING_GRAPHEME_8) {
	+ if (start < hgraphs) {
	+ result = (MVMint64)boyer_moore(haystack->body.storage.blob_8 + start, hgraphs - start, needle->body.storage.blob_8, ngraphs);
	+ if (result > -1) result += start;
	+ if (result > 500) printf("r: %i, s: %i\n", result, start);
	+ }
	+ else {
	+ result = -1;
	+ }
	+ }
	+ else {
	+ /* brute force for now. horrible, yes. halp. */
	+ while (index <= hgraphs - ngraphs) {
	+ if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
	+ result = (MVMint64)index;
	+ break;
	+ }
	+ index++;
	}
	- index++;
	}
	return result;
	}