Skip to content

Instantly share code, notes, and snippets.

@tiqwab tiqwab/sais.go
Created Sep 17, 2018

Embed
What would you like to do?
Construct suffix array by SA-IS
package main
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
"math"
)
// ---
// ref. Two Efficient Algorithms for Linear Time Suffix Array Construction
// ---
type text []rune
// ---
// Define suffixArray
// ---
type suffixArray []int
func newSuffixArray(n int) suffixArray {
sa := make(suffixArray, n)
sa.init()
return sa
}
func (sa suffixArray) init() {
for i := 0; i < len(sa); i++ {
sa[i] = -1
}
}
// ---
// Define buckets
// ---
type buckets []int
// ---
// Define suffix type, S or L
// ---
type suffixType bool
type suffixTypes []suffixType // true: S, false: L
const (
S suffixType = true
L suffixType = false
)
func (st suffixType) String() string {
if (st) {
return "S"
} else {
return "L"
}
}
func (sts suffixTypes) isLMS(i int) bool {
if i == 0 {
return false
}
return sts[i-1] == L && sts[i] == S
}
// Generate text from string.
// This function adds sentinel at the end of text.
func newTextFromString(raw string) text {
n := len(raw)
s := make(text, n+1) // +1 for sentinel
for i := 0; i < n; i++ {
s[i] = rune(raw[i])
}
s[n] = 0
return s
}
// Generate text from []rune.
// This function adds sentinel at the end of text.
func newTextFromRunes(runes []rune) text {
n := len(runes)
s := make(text, n+1) // +1 for sentinel
for i := 0; i < n; i++ {
s[i] = runes[i]
}
s[n] = 0
return s
}
// Generate text from []int.
// This function does not add sentinel.
func newTextFromInts(ints []int) text {
n := len(ints)
s := make(text, n)
for i := 0; i < n; i++ {
s[i] = rune(ints[i])
}
return s
}
func classifyTypes(s text) suffixTypes {
n := len(s)
t := make(suffixTypes, n) // all false(=L)
t[n-1] = S // for sentinel
for i := n-2; i >= 0; i-- {
prev := s[i+1]
current := s[i]
if prev > current || (prev == current && t[i+1]) {
t[i] = S
}
}
return t
}
func getBuckets(s text, charKinds int, end bool) buckets {
n := len(s)
bkt := make(buckets, charKinds+1)
for i := 0; i < n; i++ {
bkt[s[i]]++
}
size := bkt[0]
bkt[0] = 0
for i := 1; i < len(bkt); i++ {
if end {
bkt[i] = bkt[i-1] + bkt[i]
} else {
bkt[i], size = bkt[i-1] + size, bkt[i]
}
}
return bkt
}
func induceSAl(sa suffixArray, s text, t suffixTypes, charKinds int) {
n := len(s)
bkt := getBuckets(s, charKinds, false)
for i := 0; i < n; i++ {
if sa[i] > 0 && t[sa[i]-1] == L {
sa[bkt[s[sa[i]-1]]] = sa[i]-1
bkt[s[sa[i]-1]]++
}
}
}
func induceSAs(sa suffixArray, s text, t suffixTypes, charKinds int) {
n := len(s)
bkt := getBuckets(s, charKinds, true)
for i := n-1; i >= 0; i-- {
if sa[i] > 0 && t[sa[i]-1] == S {
sa[bkt[s[sa[i]-1]]] = sa[i]-1
bkt[s[sa[i]-1]]--
}
}
}
// Construct suffix array by SA-IS algorithm.
// Note that the last character of s should be sentinel(= 0).
func SAIS(s text, charKinds int) suffixArray {
n := len(s)
t := classifyTypes(s)
// ---
// stage 1: reduce the problem size by at least 1/2
// ---
bkt := getBuckets(s, charKinds, true)
sa := newSuffixArray(n)
for i := 1; i < n; i++ {
if t.isLMS(i) {
sa[bkt[s[i]]] = i
bkt[s[i]]--
}
}
// induce SAl
induceSAl(sa, s, t, charKinds)
// induce SAs
induceSAs(sa, s, t, charKinds)
// compact all the sorted LMS substrings
// the number of the LMS is not bigger than 2/n
n1 := 0
for i := 0; i < n; i++ {
if t.isLMS(sa[i]) {
sa[n1] = sa[i]
n1++
}
}
// find the lexicographic names of LMS substings
// it is stored in the right end of SA
for i := n1; i < n; i++ {
sa[i] = -1
}
name, prev := 0, -1
for i := 0; i < n1; i++ {
pos := sa[i]
diff := false
for d := 0; d < n; d++ {
if prev == -1 || s[pos+d] != s[prev+d] || t[pos+d] != t[prev+d] {
diff = true
break
} else if d > 0 && t.isLMS(pos+d) && t.isLMS(prev+d) {
// TODO: (t.isLMS(pos+d) || t.isLMS(prev+d)) in the literature, but should use '&&'?
break
}
}
if diff {
name++
prev = pos
}
pos = pos/2
sa[n1+pos] = name-1
}
for i, j := n-1, n-1; i >= n1; i-- {
if sa[i] >= 0 {
sa[j] = sa[i]
j--
}
}
// ---
// stage 2: solve the reduced problem
// recurse if names are not yet unique
// ---
sa1 := sa[:n1]
s1 := sa[(n-n1):]
if name < n1 {
sa1 = SAIS(newTextFromInts(s1), name-1)
} else {
for i := 0; i < n1; i++ {
sa1[s1[i]] = i
}
}
// ---
// stage 3: induce the SA
// ---
p1 := make([]int, n1)
for i, j := 0, 0; i < n; i++ {
if t.isLMS(i) {
p1[j] = i
j++
}
}
for i := 0; i < n1; i++ {
sa1[i] = p1[sa1[i]]
}
// init sa
for i := n1; i < n; i++ {
sa[i] = -1
}
bkt = getBuckets(s, charKinds, true)
for i := n1-1; i >= 0; i-- {
j := sa1[i]
sa[i] = -1
sa[bkt[s[j]]] = j
bkt[s[j]]--
}
induceSAl(sa, s, t, charKinds)
induceSAs(sa, s, t, charKinds)
return sa
}
const MAX_INPUT = 1000000
func parseInput() (text, []string, error) {
sc := bufio.NewScanner(os.Stdin)
buf := make([]byte, MAX_INPUT+1)
sc.Buffer(buf, MAX_INPUT+1)
sc.Split(bufio.ScanWords)
text := newTextFromString(nextText(sc))
n, err := nextInt(sc)
if err != nil {
return nil, nil, err
}
query := make([]string, n)
for i := 0; i < n; i++ {
query[i] = nextText(sc)
}
return text, query, nil
}
func nextText(sc *bufio.Scanner) string {
sc.Scan()
return sc.Text()
}
func nextInt(sc *bufio.Scanner) (int, error) {
return strconv.Atoi(nextText(sc))
}
/*
Return 0 if prefix is at the beginning of text.
If not, return 1 or -1 comparing strings.
*/
func comparePrefix(text []rune, prefix []rune) int {
for i := 0; i < len(prefix); i++ {
if text[i] < prefix[i] {
return 1
} else if text[i] > prefix[i] {
return -1
}
}
return 0
}
func binarySearch(key []rune, sa []int, text text) bool {
s := 0
e := len(sa) - 1
m := (e + s) / 2
for s <= e {
suffix := text[sa[m]:]
switch comparePrefix(suffix, key) {
case 0:
return true
case 1:
s = m + 1
case -1:
e = m - 1
}
m = (e + s) / 2
}
return false
}
/**
* Accept text and keyword from stdin like:
*
* abababab
* 3
* ab
* bb
* aa
*
* Return 1 if text contains keyword and 0 if not.
*/
func main() {
text, query, err := parseInput()
if err != nil {
panic(err)
}
sa := SAIS(text, math.MaxUint8)
if err != nil {
panic(err)
}
// fmt.Println(query)
// fmt.Println(sa)
result := make([]string, len(query))
for i, q := range query {
if binarySearch([]rune(q), sa, text) {
result[i] = "1"
} else {
result[i] = "0"
}
}
fmt.Println(strings.Join(result, "\n"))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.