Construct suffix array by SA-IS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"os" | |
"strconv" | |
"strings" | |
"math" | |
) | |
// --- | |
// ref. Two Efficient Algorithms for Linear Time Suffix Array Construction | |
// --- | |
type text []rune | |
// --- | |
// Define suffixArray | |
// --- | |
type suffixArray []int | |
func newSuffixArray(n int) suffixArray { | |
sa := make(suffixArray, n) | |
sa.init() | |
return sa | |
} | |
func (sa suffixArray) init() { | |
for i := 0; i < len(sa); i++ { | |
sa[i] = -1 | |
} | |
} | |
// --- | |
// Define buckets | |
// --- | |
type buckets []int | |
// --- | |
// Define suffix type, S or L | |
// --- | |
type suffixType bool | |
type suffixTypes []suffixType // true: S, false: L | |
const ( | |
S suffixType = true | |
L suffixType = false | |
) | |
func (st suffixType) String() string { | |
if (st) { | |
return "S" | |
} else { | |
return "L" | |
} | |
} | |
func (sts suffixTypes) isLMS(i int) bool { | |
if i == 0 { | |
return false | |
} | |
return sts[i-1] == L && sts[i] == S | |
} | |
// Generate text from string. | |
// This function adds sentinel at the end of text. | |
func newTextFromString(raw string) text { | |
n := len(raw) | |
s := make(text, n+1) // +1 for sentinel | |
for i := 0; i < n; i++ { | |
s[i] = rune(raw[i]) | |
} | |
s[n] = 0 | |
return s | |
} | |
// Generate text from []rune. | |
// This function adds sentinel at the end of text. | |
func newTextFromRunes(runes []rune) text { | |
n := len(runes) | |
s := make(text, n+1) // +1 for sentinel | |
for i := 0; i < n; i++ { | |
s[i] = runes[i] | |
} | |
s[n] = 0 | |
return s | |
} | |
// Generate text from []int. | |
// This function does not add sentinel. | |
func newTextFromInts(ints []int) text { | |
n := len(ints) | |
s := make(text, n) | |
for i := 0; i < n; i++ { | |
s[i] = rune(ints[i]) | |
} | |
return s | |
} | |
func classifyTypes(s text) suffixTypes { | |
n := len(s) | |
t := make(suffixTypes, n) // all false(=L) | |
t[n-1] = S // for sentinel | |
for i := n-2; i >= 0; i-- { | |
prev := s[i+1] | |
current := s[i] | |
if prev > current || (prev == current && t[i+1]) { | |
t[i] = S | |
} | |
} | |
return t | |
} | |
func getBuckets(s text, charKinds int, end bool) buckets { | |
n := len(s) | |
bkt := make(buckets, charKinds+1) | |
for i := 0; i < n; i++ { | |
bkt[s[i]]++ | |
} | |
size := bkt[0] | |
bkt[0] = 0 | |
for i := 1; i < len(bkt); i++ { | |
if end { | |
bkt[i] = bkt[i-1] + bkt[i] | |
} else { | |
bkt[i], size = bkt[i-1] + size, bkt[i] | |
} | |
} | |
return bkt | |
} | |
func induceSAl(sa suffixArray, s text, t suffixTypes, charKinds int) { | |
n := len(s) | |
bkt := getBuckets(s, charKinds, false) | |
for i := 0; i < n; i++ { | |
if sa[i] > 0 && t[sa[i]-1] == L { | |
sa[bkt[s[sa[i]-1]]] = sa[i]-1 | |
bkt[s[sa[i]-1]]++ | |
} | |
} | |
} | |
func induceSAs(sa suffixArray, s text, t suffixTypes, charKinds int) { | |
n := len(s) | |
bkt := getBuckets(s, charKinds, true) | |
for i := n-1; i >= 0; i-- { | |
if sa[i] > 0 && t[sa[i]-1] == S { | |
sa[bkt[s[sa[i]-1]]] = sa[i]-1 | |
bkt[s[sa[i]-1]]-- | |
} | |
} | |
} | |
// Construct suffix array by SA-IS algorithm. | |
// Note that the last character of s should be sentinel(= 0). | |
func SAIS(s text, charKinds int) suffixArray { | |
n := len(s) | |
t := classifyTypes(s) | |
// --- | |
// stage 1: reduce the problem size by at least 1/2 | |
// --- | |
bkt := getBuckets(s, charKinds, true) | |
sa := newSuffixArray(n) | |
for i := 1; i < n; i++ { | |
if t.isLMS(i) { | |
sa[bkt[s[i]]] = i | |
bkt[s[i]]-- | |
} | |
} | |
// induce SAl | |
induceSAl(sa, s, t, charKinds) | |
// induce SAs | |
induceSAs(sa, s, t, charKinds) | |
// compact all the sorted LMS substrings | |
// the number of the LMS is not bigger than 2/n | |
n1 := 0 | |
for i := 0; i < n; i++ { | |
if t.isLMS(sa[i]) { | |
sa[n1] = sa[i] | |
n1++ | |
} | |
} | |
// find the lexicographic names of LMS substings | |
// it is stored in the right end of SA | |
for i := n1; i < n; i++ { | |
sa[i] = -1 | |
} | |
name, prev := 0, -1 | |
for i := 0; i < n1; i++ { | |
pos := sa[i] | |
diff := false | |
for d := 0; d < n; d++ { | |
if prev == -1 || s[pos+d] != s[prev+d] || t[pos+d] != t[prev+d] { | |
diff = true | |
break | |
} else if d > 0 && t.isLMS(pos+d) && t.isLMS(prev+d) { | |
// TODO: (t.isLMS(pos+d) || t.isLMS(prev+d)) in the literature, but should use '&&'? | |
break | |
} | |
} | |
if diff { | |
name++ | |
prev = pos | |
} | |
pos = pos/2 | |
sa[n1+pos] = name-1 | |
} | |
for i, j := n-1, n-1; i >= n1; i-- { | |
if sa[i] >= 0 { | |
sa[j] = sa[i] | |
j-- | |
} | |
} | |
// --- | |
// stage 2: solve the reduced problem | |
// recurse if names are not yet unique | |
// --- | |
sa1 := sa[:n1] | |
s1 := sa[(n-n1):] | |
if name < n1 { | |
sa1 = SAIS(newTextFromInts(s1), name-1) | |
} else { | |
for i := 0; i < n1; i++ { | |
sa1[s1[i]] = i | |
} | |
} | |
// --- | |
// stage 3: induce the SA | |
// --- | |
p1 := make([]int, n1) | |
for i, j := 0, 0; i < n; i++ { | |
if t.isLMS(i) { | |
p1[j] = i | |
j++ | |
} | |
} | |
for i := 0; i < n1; i++ { | |
sa1[i] = p1[sa1[i]] | |
} | |
// init sa | |
for i := n1; i < n; i++ { | |
sa[i] = -1 | |
} | |
bkt = getBuckets(s, charKinds, true) | |
for i := n1-1; i >= 0; i-- { | |
j := sa1[i] | |
sa[i] = -1 | |
sa[bkt[s[j]]] = j | |
bkt[s[j]]-- | |
} | |
induceSAl(sa, s, t, charKinds) | |
induceSAs(sa, s, t, charKinds) | |
return sa | |
} | |
const MAX_INPUT = 1000000 | |
func parseInput() (text, []string, error) { | |
sc := bufio.NewScanner(os.Stdin) | |
buf := make([]byte, MAX_INPUT+1) | |
sc.Buffer(buf, MAX_INPUT+1) | |
sc.Split(bufio.ScanWords) | |
text := newTextFromString(nextText(sc)) | |
n, err := nextInt(sc) | |
if err != nil { | |
return nil, nil, err | |
} | |
query := make([]string, n) | |
for i := 0; i < n; i++ { | |
query[i] = nextText(sc) | |
} | |
return text, query, nil | |
} | |
func nextText(sc *bufio.Scanner) string { | |
sc.Scan() | |
return sc.Text() | |
} | |
func nextInt(sc *bufio.Scanner) (int, error) { | |
return strconv.Atoi(nextText(sc)) | |
} | |
/* | |
Return 0 if prefix is at the beginning of text. | |
If not, return 1 or -1 comparing strings. | |
*/ | |
func comparePrefix(text []rune, prefix []rune) int { | |
for i := 0; i < len(prefix); i++ { | |
if text[i] < prefix[i] { | |
return 1 | |
} else if text[i] > prefix[i] { | |
return -1 | |
} | |
} | |
return 0 | |
} | |
func binarySearch(key []rune, sa []int, text text) bool { | |
s := 0 | |
e := len(sa) - 1 | |
m := (e + s) / 2 | |
for s <= e { | |
suffix := text[sa[m]:] | |
switch comparePrefix(suffix, key) { | |
case 0: | |
return true | |
case 1: | |
s = m + 1 | |
case -1: | |
e = m - 1 | |
} | |
m = (e + s) / 2 | |
} | |
return false | |
} | |
/** | |
* Accept text and keyword from stdin like: | |
* | |
* abababab | |
* 3 | |
* ab | |
* bb | |
* aa | |
* | |
* Return 1 if text contains keyword and 0 if not. | |
*/ | |
func main() { | |
text, query, err := parseInput() | |
if err != nil { | |
panic(err) | |
} | |
sa := SAIS(text, math.MaxUint8) | |
if err != nil { | |
panic(err) | |
} | |
// fmt.Println(query) | |
// fmt.Println(sa) | |
result := make([]string, len(query)) | |
for i, q := range query { | |
if binarySearch([]rune(q), sa, text) { | |
result[i] = "1" | |
} else { | |
result[i] = "0" | |
} | |
} | |
fmt.Println(strings.Join(result, "\n")) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment