Skip to content

Instantly share code, notes, and snippets.

@shoenig
Created December 4, 2011 17:19
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save shoenig/1430733 to your computer and use it in GitHub Desktop.
Save shoenig/1430733 to your computer and use it in GitHub Desktop.
Knuth-Morris-Pratt String Searching in Java
import java.util.ArrayList;
public class KMP {
public static void main(String abc[]) {
System.out.println("Testing KMP");
test("abc", "a", 0);
test("abc", "b", 1);
test("abc", "c", 2);
test("abc", "d", -1);
test("catdog", "tdo", 2);
test("ratatat", "at", 1);
test("foo", "", 0);
test("", "bar", -1);
}
public static void test(String text, String word, int exp) {
char[] textC = text.toCharArray();
char[] wordC = word.toCharArray();
int result = kmp(textC, wordC);
if(result == exp)
System.out.println("PASSED");
else {
System.out.println("FAILED");
System.out.println("\ttext: " + text);
System.out.println("\tword: " + word);
System.out.println("\texp: " + exp + ", res: " + result);
}
}
// W := word
public static Integer[] createTable(char[] W) {
Integer[] table = new Integer[W.length];
int pos = 2; // cur pos to compute in T
int cnd = 0; // index of W of next character of cur candidate substr
// first few values are fixed
table[0] = -1; // table[0] := -1
table[1] = 0; // table[1] := 0
while(pos < W.length) {
// first case: substring is still good
if(W[pos-1] == W[cnd]) {
table[pos] = cnd;
cnd += 1;
pos += 1;
} else if(cnd > 0)
cnd = table[cnd];
else {
table[pos] = 0;
pos += 1;
}
}
return table;
}
// S := text string
// W := word
public static int kmp(char[] S, char[] W) {
if(W.length == 0) // substr is empty string
return 0;
if(S.length == 0) // text is empty, can't be found
return -1;
int m = 0; // index of beg. of current match in S
int i = 0; // pos. of cur char in W
Integer[] T = createTable(S);
while(m+i < S.length) {
if(W[i] == S[m+i]) {
if(i == W.length-1)
return m;
i += 1;
} else {
m = (m+i - T[i]);
if(T[i] > -1)
i = T[i];
else
i = 0;
}
}
return -1;
}
}
@aburghelea
Copy link

The lines
table[pos] = cnd;
cnd += 1;

sould be reversed.

@simonegiacomelli
Copy link

simonegiacomelli commented May 16, 2016

@icemanftg you are right.
Infact, to verify that, just add the test
test("pipippo", "pippo", 2);

@dvtomas
Copy link

dvtomas commented Dec 6, 2016

A more production-ready version

// https://gist.github.com/shoenig/1430733/250b4184dc4a2dd31aa136e2fbdded5f90489a64
public class KMP {
    private Integer[] table;
    private char[] pattern;

    public KMP(String pattern) {
        this.pattern = pattern.toCharArray();
        char[] w = this.pattern;

        table = new Integer[w.length < 2 ? 2 : w.length];
        int pos = 2; // cur pos to compute in T
        int cnd = 0; // index of W of next character of cur candidate substr

        // first few values are fixed
        table[0] = -1;  // table[0] := -1
        table[1] = 0;   // table[1] := 0

        while (pos < w.length) {
            // first case: substring is still good
            if (w[pos - 1] == w[cnd]) {
                cnd += 1;
                table[pos] = cnd;
                pos += 1;
            } else if (cnd > 0)
                cnd = table[cnd];
            else {
                table[pos] = 0;
                pos += 1;
            }
        }
    }

    /** Has the same semantics as str.indexOf(pattern) */
    public int search(String str) {
        char[] s = str.toCharArray();
        char[] w = this.pattern;

        if (w.length == 0) // substr is empty string
            return 0;
        if (s.length == 0) // text is empty, can't be found
            return -1;
        int m = 0; // index of beg. of current match in S
        int i = 0; // pos. of cur char in W

        while (m + i < s.length) {
            if (w[i] == s[m + i]) {
                if (i == w.length - 1)
                    return m;
                i += 1;
            } else {
                m = (m + i - table[i]);
                if (table[i] > -1)
                    i = table[i];
                else
                    i = 0;
            }
        }
        return -1;
    }
}

One more goodie :)

// https://gist.github.com/shoenig/1430733/250b4184dc4a2dd31aa136e2fbdded5f90489a64
public class KMPIgnoreCase {
    private Integer[] table;
    private char[] pattern;

    public KMPIgnoreCase(String pattern) {
        this.pattern = pattern.toLowerCase().toCharArray();
        char[] w = this.pattern;

        table = new Integer[w.length < 2 ? 2 : w.length];
        int pos = 2; // cur pos to compute in T
        int cnd = 0; // index of W of next character of cur candidate substr

        // first few values are fixed
        table[0] = -1;  // table[0] := -1
        table[1] = 0;   // table[1] := 0

        while (pos < w.length) {
            // first case: substring is still good
            if (w[pos - 1] == w[cnd]) {
                cnd += 1;
                table[pos] = cnd;
                pos += 1;
            } else if (cnd > 0)
                cnd = table[cnd];
            else {
                table[pos] = 0;
                pos += 1;
            }
        }
    }

    /**
     * Has the same semantics as str.indexOf(pattern), but performs searching in a case-insensitive manner
     */
    public int searchIgnoreCase(String str) {
        char[] s = str.toCharArray();
        char[] w = this.pattern;

        if (w.length == 0) // substr is empty string
            return 0;
        if (s.length == 0) // text is empty, can't be found
            return -1;
        int m = 0; // index of beg. of current match in S
        int i = 0; // pos. of cur char in W

        while (m + i < s.length) {
            if (w[i] == Character.toLowerCase(s[m + i])) {
                if (i == w.length - 1)
                    return m;
                i += 1;
            } else {
                m = (m + i - table[i]);
                if (table[i] > -1)
                    i = table[i];
                else
                    i = 0;
            }
        }
        return -1;
    }
}

Unit tests (Scala)

import org.scalatest.FunSuite
import org.scalatest.prop.GeneratorDrivenPropertyChecks

class KMPTest extends FunSuite with GeneratorDrivenPropertyChecks {

  private[this] def testKMP(pattern: String, s: String): Unit = {
    val kmp = new KMP(pattern)
    assertResult(s.indexOf(pattern))(kmp.search(s))
  }

  private[this] def testKMPIgnoreCase(pattern: String, s: String): Unit = {
    val kmp = new KMPIgnoreCase(pattern)
    assertResult(s.toLowerCase.indexOf(pattern.toLowerCase()))(kmp.searchIgnoreCase(s))
  }

  test("search") {
    forAll { (pattern: String, s: String) 
      testKMP(pattern, s)
    }
  }

  test("searchIgnoreCase") {
    forAll { (pattern: String, s: String) 
      testKMPIgnoreCase(pattern, s)
    }
  }

}

@greenlaw110
Copy link

greenlaw110 commented Aug 15, 2017

I have run a microbenchmark to compare the performance between KMP search and String.indexOf(CharSequence). Unfortunately the result show KMP search is way behind String.indexOf in both short and long text search cases:

test scenario String KMP KMP - precompiled
short text 78,763.767/ms 19,118.220/ms 29,489.667/ms
long text 4,093.381/ms 329.124/ms 426.696/ms

For details please check out the benchmark project: https://github.com/greenlaw110/java-str-benchmark/tree/string-match-algorithms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment