Skip to content

Instantly share code, notes, and snippets.

@MasterDuke17

MasterDuke17/p Secret

Last active December 11, 2016 20:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MasterDuke17/51dba071c63ee1a6989d05e7c4bbf29a to your computer and use it in GitHub Desktop.
Save MasterDuke17/51dba071c63ee1a6989d05e7c4bbf29a to your computer and use it in GitHub Desktop.
diff --git a/src/strings/ops.c b/src/strings/ops.c
index 72a30b9..9db71af 100644
--- a/src/strings/ops.c
+++ b/src/strings/ops.c
@@ -1,4 +1,11 @@
#include "moar.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#define ALPHABET_LEN 256
+#define NOT_FOUND patlen
+
+#define max(a, b) (((a) < (b)) ? (b) : (a))
#define MVM_DEBUG_STRANDS 0
@@ -196,6 +203,135 @@ MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString
}
}
+// delta1 table: delta1[c] contains the distance between the last
+// character of pat and the rightmost occurrence of c in pat.
+// If c does not occur in pat, then delta1[c] = patlen.
+// If c is at string[i] and c != pat[patlen-1], we can
+// safely shift i over by delta1[c], which is the minimum distance
+// needed to shift pat forward to get string[i] lined up
+// with some character in pat.
+// this algorithm runs in alphabet_len+patlen time.
+void make_delta1(int *delta1, uint8_t *pat, int32_t patlen) {
+ int i;
+ for (i=0; i < ALPHABET_LEN; i++) {
+ delta1[i] = NOT_FOUND;
+ }
+ for (i=0; i < patlen-1; i++) {
+ delta1[pat[i]] = patlen-1 - i;
+ }
+}
+
+// true if the suffix of word starting from word[pos] is a prefix
+// of word
+int is_prefix(uint8_t *word, int wordlen, int pos) {
+ int i;
+ int suffixlen = wordlen - pos;
+ // could also use the strncmp() library function here
+ for (i = 0; i < suffixlen; i++) {
+ if (word[i] != word[pos+i]) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+// length of the longest suffix of word ending on word[pos].
+// suffix_length("dddbcabc", 8, 4) = 2
+int suffix_length(uint8_t *word, int wordlen, int pos) {
+ int i;
+ // increment suffix length i to the first mismatch or beginning
+ // of the word
+ for (i = 0; (word[pos-i] == word[wordlen-1-i]) && (i < pos); i++);
+ return i;
+}
+
+// delta2 table: given a mismatch at pat[pos], we want to align
+// with the next possible full match could be based on what we
+// know about pat[pos+1] to pat[patlen-1].
+//
+// In case 1:
+// pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat,
+// the next plausible match starts at or after the mismatch.
+// If, within the substring pat[pos+1 .. patlen-1], lies a prefix
+// of pat, the next plausible match is here (if there are multiple
+// prefixes in the substring, pick the longest). Otherwise, the
+// next plausible match starts past the character aligned with
+// pat[patlen-1].
+//
+// In case 2:
+// pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The
+// mismatch tells us that we are not looking at the end of a match.
+// We may, however, be looking at the middle of a match.
+//
+// The first loop, which takes care of case 1, is analogous to
+// the KMP table, adapted for a 'backwards' scan order with the
+// additional restriction that the substrings it considers as
+// potential prefixes are all suffixes. In the worst case scenario
+// pat consists of the same letter repeated, so every suffix is
+// a prefix. This loop alone is not sufficient, however:
+// Suppose that pat is "ABYXCDBYX", and text is ".....ABYXCDEYX".
+// We will match X, Y, and find B != E. There is no prefix of pat
+// in the suffix "YX", so the first loop tells us to skip forward
+// by 9 characters.
+// Although superficially similar to the KMP table, the KMP table
+// relies on information about the beginning of the partial match
+// that the BM algorithm does not have.
+//
+// The second loop addresses case 2. Since suffix_length may not be
+// unique, we want to take the minimum value, which will tell us
+// how far away the closest potential match is.
+void make_delta2(int *delta2, uint8_t *pat, int32_t patlen) {
+ int p;
+ int last_prefix_index = patlen-1;
+
+ // first loop
+ for (p=patlen-1; p>=0; p--) {
+ if (is_prefix(pat, patlen, p+1)) {
+ last_prefix_index = p+1;
+ }
+ delta2[p] = last_prefix_index + (patlen-1 - p);
+ }
+
+ // second loop
+ for (p=0; p < patlen-1; p++) {
+ int slen = suffix_length(pat, patlen, p);
+ if (pat[p - slen] != pat[patlen-1 - slen]) {
+ delta2[patlen-1 - slen] = patlen-1 - p + slen;
+ }
+ }
+}
+
+int boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen) {
+ int i;
+ int delta1[ALPHABET_LEN];
+ int *delta2 = (int *)malloc(patlen * sizeof(int));
+ make_delta1(delta1, pat, patlen);
+ make_delta2(delta2, pat, patlen);
+
+ // The empty pattern must be considered specially
+ if (patlen == 0) {
+ free(delta2);
+ return 0;
+ }
+
+ i = patlen-1;
+ while (i < stringlen) {
+ int j = patlen-1;
+ while (j >= 0 && (string[i] == pat[j])) {
+ --i;
+ --j;
+ }
+ if (j < 0) {
+ free(delta2);
+ return i+1;
+ }
+
+ i += max(delta1[string[i]], delta2[j]);
+ }
+ free(delta2);
+ return -1;
+}
+
/* Returns the location of one string in another or -1 */
MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) {
MVMint64 result = -1;
@@ -217,13 +353,25 @@ MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *haystack, MVMString *
if (ngraphs > hgraphs || ngraphs < 1)
return -1;
- /* brute force for now. horrible, yes. halp. */
- while (index <= hgraphs - ngraphs) {
- if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
- result = (MVMint64)index;
- break;
+ if (haystack->body.storage_type == MVM_STRING_GRAPHEME_8 && needle->body.storage_type == MVM_STRING_GRAPHEME_8) {
+ if (start < hgraphs) {
+ result = (MVMint64)boyer_moore(haystack->body.storage.blob_8 + start, hgraphs - start, needle->body.storage.blob_8, ngraphs);
+ if (result > -1) result += start;
+ if (result > 500) printf("r: %i, s: %i\n", result, start);
+ }
+ else {
+ result = -1;
+ }
+ }
+ else {
+ /* brute force for now. horrible, yes. halp. */
+ while (index <= hgraphs - ngraphs) {
+ if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
+ result = (MVMint64)index;
+ break;
+ }
+ index++;
}
- index++;
}
return result;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment