Last active
September 24, 2019 10:11
-
-
Save gliush/f8698943082e2ba98f66dc1b447f9965 to your computer and use it in GitHub Desktop.
Find similar words in the file with "Levenshtein Distance" algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// For more info read https://en.wikipedia.org/wiki/Levenshtein_distance | |
// Kudos to the original LD implementation: | |
// https://www.golangprograms.com/golang-program-for-implementation-of-levenshtein-distance.html | |
package main | |
import ( | |
"bufio" | |
"fmt" | |
"os" | |
"strconv" | |
"strings" | |
"container/heap" | |
) | |
func levenshtein(str1, str2 []rune) int { | |
s1len := len(str1) | |
s2len := len(str2) | |
column := make([]int, len(str1)+1) | |
for y := 1; y <= s1len; y++ { | |
column[y] = y | |
} | |
for x := 1; x <= s2len; x++ { | |
column[0] = x | |
lastkey := x - 1 | |
for y := 1; y <= s1len; y++ { | |
oldkey := column[y] | |
var incr int | |
if str1[y-1] != str2[x-1] { | |
incr = 1 | |
} | |
column[y] = minimum(column[y]+1, column[y-1]+1, lastkey+incr) | |
lastkey = oldkey | |
} | |
} | |
return column[s1len] | |
} | |
func minimum(a, b, c int) int { | |
if a < b { | |
if a < c { | |
return a | |
} | |
} else { | |
if b < c { | |
return b | |
} | |
} | |
return c | |
} | |
type distance struct { | |
score int | |
word1 string | |
word2 string | |
} | |
type DistanceHeap []distance | |
func (h DistanceHeap) Len() int { return len(h) } | |
func (h DistanceHeap) Less(i, j int) bool { return h[i].score < h[j].score } | |
func (h DistanceHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } | |
func (h *DistanceHeap) Push(x interface{}) { | |
*h = append(*h, x.(distance)) | |
} | |
func (h *DistanceHeap) Pop() interface{} { | |
old := *h | |
n := len(old) | |
x := old[n-1] | |
*h = old[0 : n-1] | |
return x | |
} | |
func main() { | |
if len(os.Args) < 2 { | |
fmt.Println("Program to find similar words in the file using Levenshtein Difference algorithm") | |
fmt.Println("All spaces, quotes will be trimmed") | |
fmt.Println("") | |
fmt.Println("Usage:") | |
fmt.Println(" go run levenshtein.go <FILE> [<MAX-DISTANCE>]") | |
fmt.Println("") | |
fmt.Println("Options:") | |
fmt.Println(" <FILE> - a file to read words from") | |
fmt.Println(" <MAX-DISTANCE> - maximum distance between words to print, default=2") | |
os.Exit(1) | |
} | |
var err error | |
allServicesFileName := os.Args[1] | |
maxDistance := 2 | |
if len(os.Args) >= 3 { | |
maxDistance, err = strconv.Atoi(os.Args[2]) | |
if err != nil { | |
panic(err) | |
} | |
} | |
words, err := readWords(allServicesFileName) | |
if err != nil { | |
panic(err) | |
} | |
distanceHeap := calcDistances(words, maxDistance) | |
for distanceHeap.Len() > 0 { | |
d := heap.Pop(distanceHeap).(distance) | |
fmt.Printf("%#v\n", d) | |
} | |
} | |
func readWords(filename string) ([]string, error) { | |
file, err := os.Open(filename) | |
if err != nil { | |
return nil, err | |
} | |
defer file.Close() | |
var words []string | |
scanner := bufio.NewScanner(file) | |
for scanner.Scan() { | |
word := strings.Trim(scanner.Text(), "\"' ") | |
if len(word) == 0 { | |
continue | |
} | |
words = append(words, word) | |
} | |
return words, nil | |
} | |
func calcDistances(words []string, maxDistance int) *DistanceHeap { | |
distanceHeap := &DistanceHeap{} | |
heap.Init(distanceHeap) | |
for i, word1 := range words { | |
for _, word2 := range words[:i] { | |
dist := distance{ | |
score: levenshtein([]rune(word1), []rune(word2)), | |
word1: string(word1), | |
word2: string(word2), | |
} | |
if dist.score <= maxDistance { | |
heap.Push(distanceHeap, dist) | |
} | |
} | |
} | |
return distanceHeap | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment