Skip to content

Instantly share code, notes, and snippets.

@gliush
Last active September 24, 2019 10:11
Show Gist options
  • Save gliush/f8698943082e2ba98f66dc1b447f9965 to your computer and use it in GitHub Desktop.
Save gliush/f8698943082e2ba98f66dc1b447f9965 to your computer and use it in GitHub Desktop.
Find similar words in the file with "Levenshtein Distance" algorithm
// For more info read https://en.wikipedia.org/wiki/Levenshtein_distance
// Kudos to the original LD implementation:
// https://www.golangprograms.com/golang-program-for-implementation-of-levenshtein-distance.html
package main
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
"container/heap"
)
func levenshtein(str1, str2 []rune) int {
s1len := len(str1)
s2len := len(str2)
column := make([]int, len(str1)+1)
for y := 1; y <= s1len; y++ {
column[y] = y
}
for x := 1; x <= s2len; x++ {
column[0] = x
lastkey := x - 1
for y := 1; y <= s1len; y++ {
oldkey := column[y]
var incr int
if str1[y-1] != str2[x-1] {
incr = 1
}
column[y] = minimum(column[y]+1, column[y-1]+1, lastkey+incr)
lastkey = oldkey
}
}
return column[s1len]
}
func minimum(a, b, c int) int {
if a < b {
if a < c {
return a
}
} else {
if b < c {
return b
}
}
return c
}
type distance struct {
score int
word1 string
word2 string
}
type DistanceHeap []distance
func (h DistanceHeap) Len() int { return len(h) }
func (h DistanceHeap) Less(i, j int) bool { return h[i].score < h[j].score }
func (h DistanceHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *DistanceHeap) Push(x interface{}) {
*h = append(*h, x.(distance))
}
func (h *DistanceHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
func main() {
if len(os.Args) < 2 {
fmt.Println("Program to find similar words in the file using Levenshtein Difference algorithm")
fmt.Println("All spaces, quotes will be trimmed")
fmt.Println("")
fmt.Println("Usage:")
fmt.Println(" go run levenshtein.go <FILE> [<MAX-DISTANCE>]")
fmt.Println("")
fmt.Println("Options:")
fmt.Println(" <FILE> - a file to read words from")
fmt.Println(" <MAX-DISTANCE> - maximum distance between words to print, default=2")
os.Exit(1)
}
var err error
allServicesFileName := os.Args[1]
maxDistance := 2
if len(os.Args) >= 3 {
maxDistance, err = strconv.Atoi(os.Args[2])
if err != nil {
panic(err)
}
}
words, err := readWords(allServicesFileName)
if err != nil {
panic(err)
}
distanceHeap := calcDistances(words, maxDistance)
for distanceHeap.Len() > 0 {
d := heap.Pop(distanceHeap).(distance)
fmt.Printf("%#v\n", d)
}
}
func readWords(filename string) ([]string, error) {
file, err := os.Open(filename)
if err != nil {
return nil, err
}
defer file.Close()
var words []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
word := strings.Trim(scanner.Text(), "\"' ")
if len(word) == 0 {
continue
}
words = append(words, word)
}
return words, nil
}
func calcDistances(words []string, maxDistance int) *DistanceHeap {
distanceHeap := &DistanceHeap{}
heap.Init(distanceHeap)
for i, word1 := range words {
for _, word2 := range words[:i] {
dist := distance{
score: levenshtein([]rune(word1), []rune(word2)),
word1: string(word1),
word2: string(word2),
}
if dist.score <= maxDistance {
heap.Push(distanceHeap, dist)
}
}
}
return distanceHeap
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment