Skip to content

Instantly share code, notes, and snippets.

@lhw
Last active October 5, 2015 10:48
Show Gist options
  • Save lhw/a0e741f38eced36e4c42 to your computer and use it in GitHub Desktop.
Save lhw/a0e741f38eced36e4c42 to your computer and use it in GitHub Desktop.
Dice coefficient for string similarity on natural text
/*
Dice coefficient for string similarity on natural text
Playground: http://play.golang.org/p/JFUjcpBb66
*/
package dicecoefficient
import "strings"
import "unicode"
// calculateSimilarity returns the Dice coefficient of str1 and str2, computed
// over their character bigrams:
//
//	2 * |shared bigrams| / (|bigrams(str1)| + |bigrams(str2)|)
//
// Both inputs are lower-cased first, and any pair of adjacent runes in which
// either rune is whitespace is skipped, so bigrams never span a word boundary.
// Bigrams are treated as a multiset: a bigram occurring k times in one string
// can match at most k occurrences in the other.
//
// The result lies in [0, 1]. Byte-identical inputs return 1. If neither input
// contains a usable bigram (for example, both are shorter than two runes) and
// the inputs are not identical, the result is 0 rather than NaN.
func calculateSimilarity(str1, str2 string) float32 {
	// Declared locally so the function is self-contained; a function-scoped
	// type cannot clash with a package-level declaration of the same name.
	type bigram [2]rune

	if str1 == str2 {
		return 1.0
	}

	// bigrams returns every pair of adjacent non-whitespace runes of the
	// lower-cased input, in order of appearance.
	bigrams := func(str string) []bigram {
		runes := []rune(strings.ToLower(str))
		// Fewer than two runes cannot form a pair; returning early also avoids
		// asking make for a negative capacity on empty input.
		if len(runes) < 2 {
			return nil
		}
		out := make([]bigram, 0, len(runes)-1)
		for i := 0; i+1 < len(runes); i++ {
			if unicode.IsSpace(runes[i]) || unicode.IsSpace(runes[i+1]) {
				continue
			}
			out = append(out, bigram{runes[i], runes[i+1]})
		}
		return out
	}

	pairs1, pairs2 := bigrams(str1), bigrams(str2)

	// Guard the division: with no bigrams on either side the ratio is 0/0.
	total := len(pairs1) + len(pairs2)
	if total == 0 {
		return 0
	}

	// Count the bigrams of str2, then consume one occurrence per match so that
	// an already matched pair is not counted again.
	remaining := make(map[bigram]int, len(pairs2))
	for _, p := range pairs2 {
		remaining[p]++
	}
	matches := 0
	for _, p := range pairs1 {
		if remaining[p] > 0 {
			remaining[p]--
			matches++
		}
	}

	return (2.0 * float32(matches)) / float32(total)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment