Created
April 30, 2022 18:28
-
-
Save mixio/8258238888164c54be3f4d32d3ff2dc2 to your computer and use it in GitHub Desktop.
A BBEdit Text Filter that will help find a document's longest repetition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env gorun | |
// Original code: https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction | |
/* | |
Here is a BBEdit Text Filter that will scan the frontmost document's selected text (or the whole document in no selection) for the longest repeated substring and log a regular expression in the 'Unix Script Output.log' that allows you to find the repetition. | |
It works by replacing all non alphanumeric or underscore characters with a regular expression, thus comparing only the 'text' and ignoring anything else: whitespace, punctuation, math operators, etc. | |
As a consequence, it's more useful on textual content than on code. | |
It is derived from the go version of Ukkonen’s suffix tree construction https://rosettacode.org/wiki/Ukkonen’s_suffix_tree_construction and uses the gorun utility https://github.com/erning/gorun that allows to execute a go source file as a shell script. | |
1. To install Go | |
• from Go's installer: https://go.dev/doc/install | |
• or with Homebrew at the terminal: | |
% brew install go | |
2. To install gorun at the terminal: | |
% go install github.com/erning/gorun@latest | |
3. Copy find_longest_repeated_substring.go to ~/Library/Application Support/BBEdit/Text Filters | |
4. Use the text filter from the Text menu > Apply Text Filter > find_longest_repeated_substring | |
5. In case the 'Unix Script Output.log' doesn't show up, use the Menu Go > Commands... panel to find the 'Unix Script Output.log' | |
6. In the 'Unix Script Output.log', select the logged regular expression and copy it to the find window with <Command-Shift-E>. | |
(Warning: Do not copy the trailing return character as it is not part of the regular expression) | |
7. Activate you document and use the find window to search the occurrences of the repetition. | |
*/ | |
package main | |
import ( | |
"io/ioutil" | |
"fmt" | |
"os" | |
"log" | |
"regexp" | |
) | |
var maxChar = 256 | |
type Node struct { | |
children []*Node | |
suffixLink *Node | |
start int | |
end *int | |
suffixIndex int | |
} | |
var ( | |
text string | |
root *Node | |
lastNewNode *Node | |
activeNode *Node | |
activeEdge = -1 | |
activeLength = 0 | |
remainingSuffixCount = 0 | |
leafEnd = -1 | |
rootEnd *int | |
splitEnd *int | |
size = -1 | |
) | |
func newNode(start int, end *int) *Node { | |
node := new(Node) | |
node.children = make([]*Node, maxChar) | |
node.suffixLink = root | |
node.start = start | |
node.end = end | |
node.suffixIndex = -1 | |
return node | |
} | |
func edgeLength(n *Node) int { | |
if n == root { | |
return 0 | |
} | |
return *(n.end) - n.start + 1 | |
} | |
func walkDown(currNode *Node) bool { | |
if activeLength >= edgeLength(currNode) { | |
activeEdge += edgeLength(currNode) | |
activeLength -= edgeLength(currNode) | |
activeNode = currNode | |
return true | |
} | |
return false | |
} | |
func extendSuffixTree(pos int) { | |
leafEnd = pos | |
remainingSuffixCount++ | |
lastNewNode = nil | |
for remainingSuffixCount > 0 { | |
if activeLength == 0 { | |
activeEdge = pos | |
} | |
if activeNode.children[text[activeEdge]] == nil { | |
activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd) | |
if lastNewNode != nil { | |
lastNewNode.suffixLink = activeNode | |
lastNewNode = nil | |
} | |
} else { | |
next := activeNode.children[text[activeEdge]] | |
if walkDown(next) { | |
continue | |
} | |
if text[next.start+activeLength] == text[pos] { | |
if lastNewNode != nil && activeNode != root { | |
lastNewNode.suffixLink = activeNode | |
lastNewNode = nil | |
} | |
activeLength++ | |
break | |
} | |
temp := next.start + activeLength - 1 | |
splitEnd = &temp | |
split := newNode(next.start, splitEnd) | |
activeNode.children[text[activeEdge]] = split | |
split.children[text[pos]] = newNode(pos, &leafEnd) | |
next.start += activeLength | |
split.children[text[next.start]] = next | |
if lastNewNode != nil { | |
lastNewNode.suffixLink = split | |
} | |
lastNewNode = split | |
} | |
remainingSuffixCount-- | |
if activeNode == root && activeLength > 0 { | |
activeLength-- | |
activeEdge = pos - remainingSuffixCount + 1 | |
} else if activeNode != root { | |
activeNode = activeNode.suffixLink | |
} | |
} | |
} | |
func setSuffixIndexByDFS(n *Node, labelHeight int) { | |
if n == nil { | |
return | |
} | |
if n.start != -1 { | |
// Uncomment line below to print suffix tree | |
// log.Print(text[n.start: *(n.end) +1]) | |
} | |
leaf := 1 | |
for i := 0; i < maxChar; i++ { | |
if n.children[i] != nil { | |
// Uncomment the 3 lines below to print suffix index | |
//if leaf == 1 && n.start != -1 { | |
// log.Printf(" [%d]\n", n.suffixIndex) | |
//} | |
leaf = 0 | |
setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i])) | |
} | |
} | |
if leaf == 1 { | |
n.suffixIndex = size - labelHeight | |
// Uncomment line below to print suffix index | |
//log.Printf(" [%d]\n", n.suffixIndex) | |
} | |
} | |
func buildSuffixTree() { | |
size = len(text) | |
temp := -1 | |
rootEnd = &temp | |
root = newNode(-1, rootEnd) | |
activeNode = root | |
for i := 0; i < size; i++ { | |
extendSuffixTree(i) | |
} | |
labelHeight := 0 | |
setSuffixIndexByDFS(root, labelHeight) | |
} | |
func doTraversal(n *Node, labelHeight int, maxHeight, substringStartIndex *int) { | |
if n == nil { | |
return | |
} | |
if n.suffixIndex == -1 { | |
for i := 0; i < maxChar; i++ { | |
if n.children[i] != nil { | |
doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]), | |
maxHeight, substringStartIndex) | |
} | |
} | |
} else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) { | |
*maxHeight = labelHeight - edgeLength(n) | |
*substringStartIndex = n.suffixIndex | |
} | |
} | |
func getLongestRepeatedSubstringRegex(s string) string { | |
maxHeight := 0 | |
substringStartIndex := 0 | |
doTraversal(root, 0, &maxHeight, &substringStartIndex) | |
// Uncomment line below to print maxHeight and substringStartIndex | |
// log.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex) | |
/* | |
if s == "" { | |
log.Printf(" %s is: ", text) | |
} else { | |
log.Printf(" %s is: ", s) | |
} | |
*/ | |
if maxHeight == 0 { | |
log.Print("No repeated substring") | |
return "" | |
} else { | |
output := text[substringStartIndex:substringStartIndex+maxHeight] | |
leading_regex := regexp.MustCompile(`^\([^)]+?\)`) | |
trailing_regex := regexp.MustCompile(`\([^)]+?\)\z`) | |
output = leading_regex.ReplaceAllString(output, "") | |
output = trailing_regex.ReplaceAllString(output, "") | |
return output | |
} | |
} | |
func main() { | |
bytes, err := ioutil.ReadAll(os.Stdin) | |
if err != nil { | |
log.Println(err) | |
} else { | |
input := string(bytes) | |
regex := regexp.MustCompile(`[^\p{L}\p{N}_]+`) | |
text = regex.ReplaceAllString(input, "([^\\p{L}\\p{N}_]+?)") + "$" | |
buildSuffixTree() | |
output := getLongestRepeatedSubstringRegex("") | |
if output != "" { | |
log.Printf("regex:\n(?s)%s", output) | |
} | |
fmt.Print(input) | |
} | |
} | |
func test() { | |
tests := []string{ | |
"GEEKSFORGEEKS$", | |
"AAAAAAAAAA$", | |
"ABCDEFG$", | |
"ABABABA$", | |
"ATCGATCGA$", | |
"banana$", | |
"abcpqrabpqpq$", | |
"pqrpqpqabab$", | |
} | |
log.Println("Longest Repeated Substring in:\n") | |
for _, test := range tests { | |
text = test | |
buildSuffixTree() | |
getLongestRepeatedSubstringRegex("") | |
} | |
fmt.Print(text) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment