Last active
October 5, 2015 10:48
-
-
Save lhw/a0e741f38eced36e4c42 to your computer and use it in GitHub Desktop.
Dice coefficient for string similarity on natural text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Package dicecoefficient computes the Sørensen–Dice coefficient, a measure of
string similarity suited to natural-language text.

Playground: http://play.golang.org/p/JFUjcpBb66
*/
package dicecoefficient | |
import "strings" | |
import "unicode" | |
// calculateSimilarity returns the Sørensen–Dice coefficient of str1 and str2,
// computed over their character bigrams:
//
//	2 * |shared bigrams| / (|bigrams(str1)| + |bigrams(str2)|)
//
// Both inputs are lower-cased before comparison, and any bigram that contains
// a whitespace rune is ignored, so bigrams never span word boundaries. Bigrams
// are treated as a multiset: a bigram occurring twice in one string can match
// at most two occurrences in the other.
//
// The result is in [0, 1]. Byte-identical inputs (including two empty strings)
// yield 1. If the inputs differ and neither produces a bigram (for example two
// different single-character strings), the result is 0.
func calculateSimilarity(str1, str2 string) float32 {
	if str1 == str2 {
		return 1.0
	}

	// Declared locally so the function does not depend on a declaration
	// elsewhere in the package; arrays are comparable and usable as map keys.
	type bigram [2]rune

	// pairs returns the adjacent rune pairs of the lower-cased str, skipping
	// every pair in which either rune is whitespace.
	pairs := func(str string) []bigram {
		runes := []rune(strings.ToLower(str))
		// Fewer than two runes means no bigrams; this also guards against
		// sizing a slice with a negative length for the empty string.
		if len(runes) < 2 {
			return nil
		}
		out := make([]bigram, 0, len(runes)-1)
		for i := 0; i+1 < len(runes); i++ {
			if unicode.IsSpace(runes[i]) || unicode.IsSpace(runes[i+1]) {
				continue
			}
			out = append(out, bigram{runes[i], runes[i+1]})
		}
		return out
	}

	pairs1, pairs2 := pairs(str1), pairs(str2)

	// Avoid 0/0 (NaN) when neither string contributes a bigram.
	total := len(pairs1) + len(pairs2)
	if total == 0 {
		return 0
	}

	// Count the bigrams of str2, then consume one count per match from str1.
	// This yields the multiset intersection without a sentinel value and
	// without rescanning pairs2 for every element of pairs1.
	remaining := make(map[bigram]int, len(pairs2))
	for _, p := range pairs2 {
		remaining[p]++
	}
	matches := 0
	for _, p := range pairs1 {
		if remaining[p] > 0 {
			remaining[p]--
			matches++
		}
	}

	return 2 * float32(matches) / float32(total)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment