Created
December 4, 2011 17:19
-
-
Save shoenig/1430733 to your computer and use it in GitHub Desktop.
Knuth-Morris-Pratt String Searching in Java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public class KMP { | |
public static void main(String abc[]) { | |
System.out.println("Testing KMP"); | |
test("abc", "a", 0); | |
test("abc", "b", 1); | |
test("abc", "c", 2); | |
test("abc", "d", -1); | |
test("catdog", "tdo", 2); | |
test("ratatat", "at", 1); | |
test("foo", "", 0); | |
test("", "bar", -1); | |
} | |
public static void test(String text, String word, int exp) { | |
char[] textC = text.toCharArray(); | |
char[] wordC = word.toCharArray(); | |
int result = kmp(textC, wordC); | |
if(result == exp) | |
System.out.println("PASSED"); | |
else { | |
System.out.println("FAILED"); | |
System.out.println("\ttext: " + text); | |
System.out.println("\tword: " + word); | |
System.out.println("\texp: " + exp + ", res: " + result); | |
} | |
} | |
// W := word | |
public static Integer[] createTable(char[] W) { | |
Integer[] table = new Integer[W.length]; | |
int pos = 2; // cur pos to compute in T | |
int cnd = 0; // index of W of next character of cur candidate substr | |
// first few values are fixed | |
table[0] = -1; // table[0] := -1 | |
table[1] = 0; // table[1] := 0 | |
while(pos < W.length) { | |
// first case: substring is still good | |
if(W[pos-1] == W[cnd]) { | |
table[pos] = cnd; | |
cnd += 1; | |
pos += 1; | |
} else if(cnd > 0) | |
cnd = table[cnd]; | |
else { | |
table[pos] = 0; | |
pos += 1; | |
} | |
} | |
return table; | |
} | |
// S := text string | |
// W := word | |
public static int kmp(char[] S, char[] W) { | |
if(W.length == 0) // substr is empty string | |
return 0; | |
if(S.length == 0) // text is empty, can't be found | |
return -1; | |
int m = 0; // index of beg. of current match in S | |
int i = 0; // pos. of cur char in W | |
Integer[] T = createTable(S); | |
while(m+i < S.length) { | |
if(W[i] == S[m+i]) { | |
if(i == W.length-1) | |
return m; | |
i += 1; | |
} else { | |
m = (m+i - T[i]); | |
if(T[i] > -1) | |
i = T[i]; | |
else | |
i = 0; | |
} | |
} | |
return -1; | |
} | |
} |
I have run a microbenchmark to compare the performance between KMP search and String.indexOf(CharSequence)
. Unfortunately the result show KMP search is way behind String.indexOf
in both short and long text search cases:
test scenario | String | KMP | KMP - precompiled |
---|---|---|---|
short text | 78,763.767/ms |
19,118.220/ms |
29,489.667/ms |
long text | 4,093.381/ms |
329.124/ms |
426.696/ms |
For details please check out the benchmark project: https://github.com/greenlaw110/java-str-benchmark/tree/string-match-algorithms
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A more production-ready version
One more goodie :)
Unit tests (Scala)