Skip to content

Instantly share code, notes, and snippets.

@mixio
Created April 30, 2022 18:28
Show Gist options
  • Save mixio/8258238888164c54be3f4d32d3ff2dc2 to your computer and use it in GitHub Desktop.
Save mixio/8258238888164c54be3f4d32d3ff2dc2 to your computer and use it in GitHub Desktop.
A BBEdit Text Filter that will help find a document's longest repetition
#!/usr/bin/env gorun
// Original code: https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction
/*
Here is a BBEdit Text Filter that will scan the frontmost document's selected text (or the whole document in no selection) for the longest repeated substring and log a regular expression in the 'Unix Script Output.log' that allows you to find the repetition.
It works by replacing all non alphanumeric or underscore characters with a regular expression, thus comparing only the 'text' and ignoring anything else: whitespace, punctuation, math operators, etc.
As a consequence, it's more useful on textual content than on code.
It is derived from the go version of Ukkonen’s suffix tree construction https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction and uses the gorun utility https://github.com/erning/gorun that allows to execute a go source file as a shell script.
1. To install Go
• from Go's installer: https://go.dev/doc/install
• or with Homebrew at the terminal:
% brew install go
2. To install gorun at the terminal:
% go install github.com/erning/gorun@latest
3. Copy find_longest_repeated_substring.go to ~/Library/Application Support/BBEdit/Text Filters
4. Use the text filter from the Text menu > Apply Text Filter > find_longest_repeated_substring
5. In case the 'Unix Script Output.log' doesn't show up, use the Menu Go > Commands... panel to find the 'Unix Script Output.log'
6. In the 'Unix Script Output.log', select the logged regular expression and copy it to the find window with <Command-Shift-E>.
(Warning: Do not copy the trailing return character as it is not part of the regular expression)
7. Activate you document and use the find window to search the occurrences of the repetition.
*/
package main
import (
"io/ioutil"
"fmt"
"os"
"log"
"regexp"
)
var maxChar = 256
type Node struct {
children []*Node
suffixLink *Node
start int
end *int
suffixIndex int
}
var (
text string
root *Node
lastNewNode *Node
activeNode *Node
activeEdge = -1
activeLength = 0
remainingSuffixCount = 0
leafEnd = -1
rootEnd *int
splitEnd *int
size = -1
)
func newNode(start int, end *int) *Node {
node := new(Node)
node.children = make([]*Node, maxChar)
node.suffixLink = root
node.start = start
node.end = end
node.suffixIndex = -1
return node
}
func edgeLength(n *Node) int {
if n == root {
return 0
}
return *(n.end) - n.start + 1
}
func walkDown(currNode *Node) bool {
if activeLength >= edgeLength(currNode) {
activeEdge += edgeLength(currNode)
activeLength -= edgeLength(currNode)
activeNode = currNode
return true
}
return false
}
func extendSuffixTree(pos int) {
leafEnd = pos
remainingSuffixCount++
lastNewNode = nil
for remainingSuffixCount > 0 {
if activeLength == 0 {
activeEdge = pos
}
if activeNode.children[text[activeEdge]] == nil {
activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd)
if lastNewNode != nil {
lastNewNode.suffixLink = activeNode
lastNewNode = nil
}
} else {
next := activeNode.children[text[activeEdge]]
if walkDown(next) {
continue
}
if text[next.start+activeLength] == text[pos] {
if lastNewNode != nil && activeNode != root {
lastNewNode.suffixLink = activeNode
lastNewNode = nil
}
activeLength++
break
}
temp := next.start + activeLength - 1
splitEnd = &temp
split := newNode(next.start, splitEnd)
activeNode.children[text[activeEdge]] = split
split.children[text[pos]] = newNode(pos, &leafEnd)
next.start += activeLength
split.children[text[next.start]] = next
if lastNewNode != nil {
lastNewNode.suffixLink = split
}
lastNewNode = split
}
remainingSuffixCount--
if activeNode == root && activeLength > 0 {
activeLength--
activeEdge = pos - remainingSuffixCount + 1
} else if activeNode != root {
activeNode = activeNode.suffixLink
}
}
}
func setSuffixIndexByDFS(n *Node, labelHeight int) {
if n == nil {
return
}
if n.start != -1 {
// Uncomment line below to print suffix tree
// log.Print(text[n.start: *(n.end) +1])
}
leaf := 1
for i := 0; i < maxChar; i++ {
if n.children[i] != nil {
// Uncomment the 3 lines below to print suffix index
//if leaf == 1 && n.start != -1 {
// log.Printf(" [%d]\n", n.suffixIndex)
//}
leaf = 0
setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i]))
}
}
if leaf == 1 {
n.suffixIndex = size - labelHeight
// Uncomment line below to print suffix index
//log.Printf(" [%d]\n", n.suffixIndex)
}
}
func buildSuffixTree() {
size = len(text)
temp := -1
rootEnd = &temp
root = newNode(-1, rootEnd)
activeNode = root
for i := 0; i < size; i++ {
extendSuffixTree(i)
}
labelHeight := 0
setSuffixIndexByDFS(root, labelHeight)
}
func doTraversal(n *Node, labelHeight int, maxHeight, substringStartIndex *int) {
if n == nil {
return
}
if n.suffixIndex == -1 {
for i := 0; i < maxChar; i++ {
if n.children[i] != nil {
doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]),
maxHeight, substringStartIndex)
}
}
} else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) {
*maxHeight = labelHeight - edgeLength(n)
*substringStartIndex = n.suffixIndex
}
}
func getLongestRepeatedSubstringRegex(s string) string {
maxHeight := 0
substringStartIndex := 0
doTraversal(root, 0, &maxHeight, &substringStartIndex)
// Uncomment line below to print maxHeight and substringStartIndex
// log.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex)
/*
if s == "" {
log.Printf(" %s is: ", text)
} else {
log.Printf(" %s is: ", s)
}
*/
if maxHeight == 0 {
log.Print("No repeated substring")
return ""
} else {
output := text[substringStartIndex:substringStartIndex+maxHeight]
leading_regex := regexp.MustCompile(`^\([^)]+?\)`)
trailing_regex := regexp.MustCompile(`\([^)]+?\)\z`)
output = leading_regex.ReplaceAllString(output, "")
output = trailing_regex.ReplaceAllString(output, "")
return output
}
}
func main() {
bytes, err := ioutil.ReadAll(os.Stdin)
if err != nil {
log.Println(err)
} else {
input := string(bytes)
regex := regexp.MustCompile(`[^\p{L}\p{N}_]+`)
text = regex.ReplaceAllString(input, "([^\\p{L}\\p{N}_]+?)") + "$"
buildSuffixTree()
output := getLongestRepeatedSubstringRegex("")
if output != "" {
log.Printf("regex:\n(?s)%s", output)
}
fmt.Print(input)
}
}
func test() {
tests := []string{
"GEEKSFORGEEKS$",
"AAAAAAAAAA$",
"ABCDEFG$",
"ABABABA$",
"ATCGATCGA$",
"banana$",
"abcpqrabpqpq$",
"pqrpqpqabab$",
}
log.Println("Longest Repeated Substring in:\n")
for _, test := range tests {
text = test
buildSuffixTree()
getLongestRepeatedSubstringRegex("")
}
fmt.Print(text)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment