Created
December 4, 2011 17:19
-
-
Save shoenig/1430733 to your computer and use it in GitHub Desktop.
Knuth-Morris-Pratt String Searching in Java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
public class KMP { | |
public static void main(String abc[]) { | |
System.out.println("Testing KMP"); | |
test("abc", "a", 0); | |
test("abc", "b", 1); | |
test("abc", "c", 2); | |
test("abc", "d", -1); | |
test("catdog", "tdo", 2); | |
test("ratatat", "at", 1); | |
test("foo", "", 0); | |
test("", "bar", -1); | |
} | |
public static void test(String text, String word, int exp) { | |
char[] textC = text.toCharArray(); | |
char[] wordC = word.toCharArray(); | |
int result = kmp(textC, wordC); | |
if(result == exp) | |
System.out.println("PASSED"); | |
else { | |
System.out.println("FAILED"); | |
System.out.println("\ttext: " + text); | |
System.out.println("\tword: " + word); | |
System.out.println("\texp: " + exp + ", res: " + result); | |
} | |
} | |
// W := word | |
public static Integer[] createTable(char[] W) { | |
Integer[] table = new Integer[W.length]; | |
int pos = 2; // cur pos to compute in T | |
int cnd = 0; // index of W of next character of cur candidate substr | |
// first few values are fixed | |
table[0] = -1; // table[0] := -1 | |
table[1] = 0; // table[1] := 0 | |
while(pos < W.length) { | |
// first case: substring is still good | |
if(W[pos-1] == W[cnd]) { | |
table[pos] = cnd; | |
cnd += 1; | |
pos += 1; | |
} else if(cnd > 0) | |
cnd = table[cnd]; | |
else { | |
table[pos] = 0; | |
pos += 1; | |
} | |
} | |
return table; | |
} | |
// S := text string | |
// W := word | |
public static int kmp(char[] S, char[] W) { | |
if(W.length == 0) // substr is empty string | |
return 0; | |
if(S.length == 0) // text is empty, can't be found | |
return -1; | |
int m = 0; // index of beg. of current match in S | |
int i = 0; // pos. of cur char in W | |
Integer[] T = createTable(S); | |
while(m+i < S.length) { | |
if(W[i] == S[m+i]) { | |
if(i == W.length-1) | |
return m; | |
i += 1; | |
} else { | |
m = (m+i - T[i]); | |
if(T[i] > -1) | |
i = T[i]; | |
else | |
i = 0; | |
} | |
} | |
return -1; | |
} | |
} |
@icemanftg you are right.
Infact, to verify that, just add the test
test("pipippo", "pippo", 2);
A more production-ready version
// https://gist.github.com/shoenig/1430733/250b4184dc4a2dd31aa136e2fbdded5f90489a64
public class KMP {
private Integer[] table;
private char[] pattern;
public KMP(String pattern) {
this.pattern = pattern.toCharArray();
char[] w = this.pattern;
table = new Integer[w.length < 2 ? 2 : w.length];
int pos = 2; // cur pos to compute in T
int cnd = 0; // index of W of next character of cur candidate substr
// first few values are fixed
table[0] = -1; // table[0] := -1
table[1] = 0; // table[1] := 0
while (pos < w.length) {
// first case: substring is still good
if (w[pos - 1] == w[cnd]) {
cnd += 1;
table[pos] = cnd;
pos += 1;
} else if (cnd > 0)
cnd = table[cnd];
else {
table[pos] = 0;
pos += 1;
}
}
}
/** Has the same semantics as str.indexOf(pattern) */
public int search(String str) {
char[] s = str.toCharArray();
char[] w = this.pattern;
if (w.length == 0) // substr is empty string
return 0;
if (s.length == 0) // text is empty, can't be found
return -1;
int m = 0; // index of beg. of current match in S
int i = 0; // pos. of cur char in W
while (m + i < s.length) {
if (w[i] == s[m + i]) {
if (i == w.length - 1)
return m;
i += 1;
} else {
m = (m + i - table[i]);
if (table[i] > -1)
i = table[i];
else
i = 0;
}
}
return -1;
}
}
One more goodie :)
// https://gist.github.com/shoenig/1430733/250b4184dc4a2dd31aa136e2fbdded5f90489a64
public class KMPIgnoreCase {
private Integer[] table;
private char[] pattern;
public KMPIgnoreCase(String pattern) {
this.pattern = pattern.toLowerCase().toCharArray();
char[] w = this.pattern;
table = new Integer[w.length < 2 ? 2 : w.length];
int pos = 2; // cur pos to compute in T
int cnd = 0; // index of W of next character of cur candidate substr
// first few values are fixed
table[0] = -1; // table[0] := -1
table[1] = 0; // table[1] := 0
while (pos < w.length) {
// first case: substring is still good
if (w[pos - 1] == w[cnd]) {
cnd += 1;
table[pos] = cnd;
pos += 1;
} else if (cnd > 0)
cnd = table[cnd];
else {
table[pos] = 0;
pos += 1;
}
}
}
/**
* Has the same semantics as str.indexOf(pattern), but performs searching in a case-insensitive manner
*/
public int searchIgnoreCase(String str) {
char[] s = str.toCharArray();
char[] w = this.pattern;
if (w.length == 0) // substr is empty string
return 0;
if (s.length == 0) // text is empty, can't be found
return -1;
int m = 0; // index of beg. of current match in S
int i = 0; // pos. of cur char in W
while (m + i < s.length) {
if (w[i] == Character.toLowerCase(s[m + i])) {
if (i == w.length - 1)
return m;
i += 1;
} else {
m = (m + i - table[i]);
if (table[i] > -1)
i = table[i];
else
i = 0;
}
}
return -1;
}
}
Unit tests (Scala)
import org.scalatest.FunSuite
import org.scalatest.prop.GeneratorDrivenPropertyChecks
class KMPTest extends FunSuite with GeneratorDrivenPropertyChecks {
private[this] def testKMP(pattern: String, s: String): Unit = {
val kmp = new KMP(pattern)
assertResult(s.indexOf(pattern))(kmp.search(s))
}
private[this] def testKMPIgnoreCase(pattern: String, s: String): Unit = {
val kmp = new KMPIgnoreCase(pattern)
assertResult(s.toLowerCase.indexOf(pattern.toLowerCase()))(kmp.searchIgnoreCase(s))
}
test("search") {
forAll { (pattern: String, s: String) ⇒
testKMP(pattern, s)
}
}
test("searchIgnoreCase") {
forAll { (pattern: String, s: String) ⇒
testKMPIgnoreCase(pattern, s)
}
}
}
I have run a microbenchmark to compare the performance between KMP search and String.indexOf(CharSequence)
. Unfortunately the result show KMP search is way behind String.indexOf
in both short and long text search cases:
test scenario | String | KMP | KMP - precompiled |
---|---|---|---|
short text | 78,763.767/ms |
19,118.220/ms |
29,489.667/ms |
long text | 4,093.381/ms |
329.124/ms |
426.696/ms |
For details please check out the benchmark project: https://github.com/greenlaw110/java-str-benchmark/tree/string-match-algorithms
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The lines
table[pos] = cnd;
cnd += 1;
sould be reversed.