mixio/find_longest_repeated_substring.go

## find_longest_repeated_substring.go
#!/usr/bin/env gorun

// Original code: https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction

/*
Here is a BBEdit Text Filter that will scan the frontmost document's selected text (or the whole document in no selection) for the longest repeated substring and log a regular expression in the 'Unix Script Output.log' that allows you to find the repetition.

It works by replacing all non alphanumeric or underscore characters with a regular expression, thus comparing only the 'text' and ignoring anything else: whitespace, punctuation, math operators, etc.

As a consequence, it's more useful on textual content than on code.

It is derived from the go version of Ukkonen’s suffix tree construction https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction and uses the gorun utility https://github.com/erning/gorun that allows to execute a go source file as a shell script.

 1. To install Go

  • from Go's installer: https://go.dev/doc/install

  • or with Homebrew at the terminal:

        % brew install go

 2. To install gorun at the terminal:

        % go install github.com/erning/gorun@latest

 3. Copy find_longest_repeated_substring.go to ~/Library/Application Support/BBEdit/Text Filters

 4. Use the text filter from the Text menu > Apply Text Filter > find_longest_repeated_substring

 5. In case the 'Unix Script Output.log' doesn't show up, use the Menu Go > Commands... panel to find the 'Unix Script Output.log'

 6. In the 'Unix Script Output.log', select the logged regular expression and copy it to the find window with <Command-Shift-E>.
    (Warning: Do not copy the trailing return character as it is not part of the regular expression)

 7. Activate you document and use the find window to search the occurrences of the repetition.
*/

package main

import (
    "io/ioutil"
    "fmt"
    "os"
    "log"
    "regexp"
)

var maxChar = 256

type Node struct {
    children    []*Node
    suffixLink  *Node
    start       int
    end         *int
    suffixIndex int
}

var (
    text                 string
    root                 *Node
    lastNewNode          *Node
    activeNode           *Node
    activeEdge           = -1
    activeLength         = 0
    remainingSuffixCount = 0
    leafEnd              = -1
    rootEnd              *int
    splitEnd             *int
    size                 = -1
)

func newNode(start int, end *int) *Node {
    node := new(Node)
    node.children = make([]*Node, maxChar)
    node.suffixLink = root
    node.start = start
    node.end = end
    node.suffixIndex = -1
    return node
}

func edgeLength(n *Node) int {
    if n == root {
        return 0
    }
    return *(n.end) - n.start + 1
}

func walkDown(currNode *Node) bool {
    if activeLength >= edgeLength(currNode) {
        activeEdge += edgeLength(currNode)
        activeLength -= edgeLength(currNode)
        activeNode = currNode
        return true
    }
    return false
}

func extendSuffixTree(pos int) {
    leafEnd = pos
    remainingSuffixCount++
    lastNewNode = nil
    for remainingSuffixCount > 0 {
        if activeLength == 0 {
            activeEdge = pos
        }
        if activeNode.children[text[activeEdge]] == nil {
            activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd)
            if lastNewNode != nil {
                lastNewNode.suffixLink = activeNode
                lastNewNode = nil
            }
        } else {
            next := activeNode.children[text[activeEdge]]
            if walkDown(next) {
                continue
            }
            if text[next.start+activeLength] == text[pos] {
                if lastNewNode != nil && activeNode != root {
                    lastNewNode.suffixLink = activeNode
                    lastNewNode = nil
                }
                activeLength++
                break
            }
            temp := next.start + activeLength - 1
            splitEnd = &temp
            split := newNode(next.start, splitEnd)
            activeNode.children[text[activeEdge]] = split
            split.children[text[pos]] = newNode(pos, &leafEnd)
            next.start += activeLength
            split.children[text[next.start]] = next
            if lastNewNode != nil {
                lastNewNode.suffixLink = split
            }
            lastNewNode = split
        }
        remainingSuffixCount--
        if activeNode == root && activeLength > 0 {
            activeLength--
            activeEdge = pos - remainingSuffixCount + 1
        } else if activeNode != root {
            activeNode = activeNode.suffixLink
        }
    }
}

func setSuffixIndexByDFS(n *Node, labelHeight int) {
    if n == nil {
        return
    }
    if n.start != -1 {
        // Uncomment line below to print suffix tree
        // log.Print(text[n.start: *(n.end) +1])
    }
    leaf := 1
    for i := 0; i < maxChar; i++ {
        if n.children[i] != nil {
            // Uncomment the 3 lines below to print suffix index
            //if leaf == 1 && n.start != -1 {
            //    log.Printf(" [%d]\n", n.suffixIndex)
            //}
            leaf = 0
            setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i]))
        }
    }
    if leaf == 1 {
        n.suffixIndex = size - labelHeight
        // Uncomment line below to print suffix index
        //log.Printf(" [%d]\n", n.suffixIndex)
    }
}

func buildSuffixTree() {
    size = len(text)
    temp := -1
    rootEnd = &temp
    root = newNode(-1, rootEnd)
    activeNode = root
    for i := 0; i < size; i++ {
        extendSuffixTree(i)
    }
    labelHeight := 0
    setSuffixIndexByDFS(root, labelHeight)
}

func doTraversal(n *Node, labelHeight int, maxHeight, substringStartIndex *int) {
    if n == nil {
        return
    }
    if n.suffixIndex == -1 {
        for i := 0; i < maxChar; i++ {
            if n.children[i] != nil {
                doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]),
                    maxHeight, substringStartIndex)
            }
        }
    } else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) {
        *maxHeight = labelHeight - edgeLength(n)
        *substringStartIndex = n.suffixIndex
    }
}

func getLongestRepeatedSubstringRegex(s string) string {
    maxHeight := 0
    substringStartIndex := 0
    doTraversal(root, 0, &maxHeight, &substringStartIndex)
    // Uncomment line below to print maxHeight and substringStartIndex
    // log.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex)
    /*
    if s == "" {
        log.Printf("  %s is: ", text)
    } else {
        log.Printf("  %s is: ", s)
    }
    */
    if maxHeight == 0 {
        log.Print("No repeated substring")
        return ""
    } else {
        output := text[substringStartIndex:substringStartIndex+maxHeight]
        leading_regex := regexp.MustCompile(`^\([^)]+?\)`)
        trailing_regex := regexp.MustCompile(`\([^)]+?\)\z`)
        output = leading_regex.ReplaceAllString(output, "")
        output = trailing_regex.ReplaceAllString(output, "")
        return output
    }
}

func main() {
    bytes, err := ioutil.ReadAll(os.Stdin)
    if err != nil {
        log.Println(err)
    } else {
        input := string(bytes)
        regex := regexp.MustCompile(`[^\p{L}\p{N}_]+`)
        text = regex.ReplaceAllString(input, "([^\\p{L}\\p{N}_]+?)") + "$"
        buildSuffixTree()
        output := getLongestRepeatedSubstringRegex("")
        if output != "" {
            log.Printf("regex:\n(?s)%s", output)
        }
        fmt.Print(input)
    }
}

func test() {
    tests := []string{
        "GEEKSFORGEEKS$",
        "AAAAAAAAAA$",
        "ABCDEFG$",
        "ABABABA$",
        "ATCGATCGA$",
        "banana$",
        "abcpqrabpqpq$",
        "pqrpqpqabab$",
    }
    log.Println("Longest Repeated Substring in:\n")
    for _, test := range tests {
        text = test
        buildSuffixTree()
        getLongestRepeatedSubstringRegex("")
    }
    fmt.Print(text)
}
	#!/usr/bin/env gorun

	// Original code: https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction

	/*
	Here is a BBEdit Text Filter that will scan the frontmost document's selected text (or the whole document in no selection) for the longest repeated substring and log a regular expression in the 'Unix Script Output.log' that allows you to find the repetition.

	It works by replacing all non alphanumeric or underscore characters with a regular expression, thus comparing only the 'text' and ignoring anything else: whitespace, punctuation, math operators, etc.

	As a consequence, it's more useful on textual content than on code.

	It is derived from the go version of Ukkonen’s suffix tree construction https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction and uses the gorun utility https://github.com/erning/gorun that allows to execute a go source file as a shell script.

	1. To install Go

	• from Go's installer: https://go.dev/doc/install

	• or with Homebrew at the terminal:

	% brew install go

	2. To install gorun at the terminal:

	% go install github.com/erning/gorun@latest

	3. Copy find_longest_repeated_substring.go to ~/Library/Application Support/BBEdit/Text Filters

	4. Use the text filter from the Text menu > Apply Text Filter > find_longest_repeated_substring

	5. In case the 'Unix Script Output.log' doesn't show up, use the Menu Go > Commands... panel to find the 'Unix Script Output.log'

	6. In the 'Unix Script Output.log', select the logged regular expression and copy it to the find window with <Command-Shift-E>.
	(Warning: Do not copy the trailing return character as it is not part of the regular expression)

	7. Activate you document and use the find window to search the occurrences of the repetition.
	*/

	package main

	import (
	"io/ioutil"
	"fmt"
	"os"
	"log"
	"regexp"
	)

	var maxChar = 256

	type Node struct {
	children []*Node
	suffixLink *Node
	start int
	end *int
	suffixIndex int
	}

	var (
	text string
	root *Node
	lastNewNode *Node
	activeNode *Node
	activeEdge = -1
	activeLength = 0
	remainingSuffixCount = 0
	leafEnd = -1
	rootEnd *int
	splitEnd *int
	size = -1
	)

	func newNode(start int, end int) Node {
	node := new(Node)
	node.children = make([]*Node, maxChar)
	node.suffixLink = root
	node.start = start
	node.end = end
	node.suffixIndex = -1
	return node
	}

	func edgeLength(n *Node) int {
	if n == root {
	return 0
	}
	return *(n.end) - n.start + 1
	}

	func walkDown(currNode *Node) bool {
	if activeLength >= edgeLength(currNode) {
	activeEdge += edgeLength(currNode)
	activeLength -= edgeLength(currNode)
	activeNode = currNode
	return true
	}
	return false
	}

	func extendSuffixTree(pos int) {
	leafEnd = pos
	remainingSuffixCount++
	lastNewNode = nil
	for remainingSuffixCount > 0 {
	if activeLength == 0 {
	activeEdge = pos
	}
	if activeNode.children[text[activeEdge]] == nil {
	activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd)
	if lastNewNode != nil {
	lastNewNode.suffixLink = activeNode
	lastNewNode = nil
	}
	} else {
	next := activeNode.children[text[activeEdge]]
	if walkDown(next) {
	continue
	}
	if text[next.start+activeLength] == text[pos] {
	if lastNewNode != nil && activeNode != root {
	lastNewNode.suffixLink = activeNode
	lastNewNode = nil
	}
	activeLength++
	break
	}
	temp := next.start + activeLength - 1
	splitEnd = &temp
	split := newNode(next.start, splitEnd)
	activeNode.children[text[activeEdge]] = split
	split.children[text[pos]] = newNode(pos, &leafEnd)
	next.start += activeLength
	split.children[text[next.start]] = next
	if lastNewNode != nil {
	lastNewNode.suffixLink = split
	}
	lastNewNode = split
	}
	remainingSuffixCount--
	if activeNode == root && activeLength > 0 {
	activeLength--
	activeEdge = pos - remainingSuffixCount + 1
	} else if activeNode != root {
	activeNode = activeNode.suffixLink
	}
	}
	}

	func setSuffixIndexByDFS(n *Node, labelHeight int) {
	if n == nil {
	return
	}
	if n.start != -1 {
	// Uncomment line below to print suffix tree
	// log.Print(text[n.start: *(n.end) +1])
	}
	leaf := 1
	for i := 0; i < maxChar; i++ {
	if n.children[i] != nil {
	// Uncomment the 3 lines below to print suffix index
	//if leaf == 1 && n.start != -1 {
	// log.Printf(" [%d]\n", n.suffixIndex)
	//}
	leaf = 0
	setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i]))
	}
	}
	if leaf == 1 {
	n.suffixIndex = size - labelHeight
	// Uncomment line below to print suffix index
	//log.Printf(" [%d]\n", n.suffixIndex)
	}
	}

	func buildSuffixTree() {
	size = len(text)
	temp := -1
	rootEnd = &temp
	root = newNode(-1, rootEnd)
	activeNode = root
	for i := 0; i < size; i++ {
	extendSuffixTree(i)
	}
	labelHeight := 0
	setSuffixIndexByDFS(root, labelHeight)
	}

	func doTraversal(n Node, labelHeight int, maxHeight, substringStartIndex int) {
	if n == nil {
	return
	}
	if n.suffixIndex == -1 {
	for i := 0; i < maxChar; i++ {
	if n.children[i] != nil {
	doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]),
	maxHeight, substringStartIndex)
	}
	}
	} else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) {
	*maxHeight = labelHeight - edgeLength(n)
	*substringStartIndex = n.suffixIndex
	}
	}

	func getLongestRepeatedSubstringRegex(s string) string {
	maxHeight := 0
	substringStartIndex := 0
	doTraversal(root, 0, &maxHeight, &substringStartIndex)
	// Uncomment line below to print maxHeight and substringStartIndex
	// log.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex)
	/*
	if s == "" {
	log.Printf(" %s is: ", text)
	} else {
	log.Printf(" %s is: ", s)
	}
	*/
	if maxHeight == 0 {
	log.Print("No repeated substring")
	return ""
	} else {
	output := text[substringStartIndex:substringStartIndex+maxHeight]
	leading_regex := regexp.MustCompile(`^\([^)]+?\)`)
	trailing_regex := regexp.MustCompile(`\([^)]+?\)\z`)
	output = leading_regex.ReplaceAllString(output, "")
	output = trailing_regex.ReplaceAllString(output, "")
	return output
	}
	}

	func main() {
	bytes, err := ioutil.ReadAll(os.Stdin)
	if err != nil {
	log.Println(err)
	} else {
	input := string(bytes)
	regex := regexp.MustCompile(`[^\p{L}\p{N}_]+`)
	text = regex.ReplaceAllString(input, "([^\\p{L}\\p{N}_]+?)") + "$"
	buildSuffixTree()
	output := getLongestRepeatedSubstringRegex("")
	if output != "" {
	log.Printf("regex:\n(?s)%s", output)
	}
	fmt.Print(input)
	}
	}

	func test() {
	tests := []string{
	"GEEKSFORGEEKS$",
	"AAAAAAAAAA$",
	"ABCDEFG$",
	"ABABABA$",
	"ATCGATCGA$",
	"banana$",
	"abcpqrabpqpq$",
	"pqrpqpqabab$",
	}
	log.Println("Longest Repeated Substring in:\n")
	for _, test := range tests {
	text = test
	buildSuffixTree()
	getLongestRepeatedSubstringRegex("")
	}
	fmt.Print(text)
	}