Skip to content

Instantly share code, notes, and snippets.

@kaja47
Created August 8, 2012 01:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaja47/3291354 to your computer and use it in GitHub Desktop.
Save kaja47/3291354 to your computer and use it in GitHub Desktop.
Cosine similarity
/** Word-frequency vector: maps a word to its occurrence count. */
type WordFreq = Map[String, Int]

/** Dot product of two sparse word-frequency vectors.
  *
  * Only words present in both maps contribute to the result. Each product is
  * computed and accumulated as `Double`, so large counts cannot wrap around the
  * way `Int` multiplication/summation would.
  *
  * @param a first word-frequency vector
  * @param b second word-frequency vector
  * @return the sum over all words of `a(word) * b(word)`, treating a missing
  *         word as count 0; `0.0` when the maps share no word
  */
def dot(a: WordFreq, b: WordFreq): Double =
  a.foldLeft(0.0) { case (acc, (word, count)) =>
    // One lookup with a default of 0 replaces the `contains` + `apply` pair.
    acc + count.toDouble * b.getOrElse(word, 0)
  }
/** Euclidean (L2) norm of a word-frequency vector.
  *
  * @param ws word-frequency vector
  * @return the square root of the sum of squared counts; `0.0` for an empty map
  */
def magnitude(ws: WordFreq): Double =
  // Widen to Double *before* squaring: squaring in Int wraps for counts above 46340.
  math.sqrt(ws.values.foldLeft(0.0) { (acc, count) => acc + count.toDouble * count })
/** Cosine similarity of two word-frequency vectors.
  *
  * @param a first word-frequency vector
  * @param b second word-frequency vector
  * @return `dot(a, b) / (magnitude(a) * magnitude(b))`, clamped to `[-1, 1]`;
  *         `0.0` when either vector has zero magnitude (e.g. an empty map)
  */
def cossim(a: WordFreq, b: WordFreq): Double = {
  val norms = magnitude(a) * magnitude(b)
  if (norms == 0.0) {
    // A zero vector has no direction; dividing would yield NaN (0.0 / 0.0).
    0.0
  } else {
    // Floating-point rounding can push the quotient marginally outside [-1, 1]
    // (e.g. 1.0000000000000002 for identical vectors), which breaks acos downstream.
    math.max(-1.0, math.min(1.0, dot(a, b) / norms))
  }
}
/** Converts a cosine similarity into an angle-based score.
  *
  * The score is `1 - 2 * angle / Pi`, where `angle = acos(cossim)`: it is `1` for
  * an angle of 0, `0` for a right angle and `-1` for an angle of Pi.
  *
  * @param cossim cosine of the angle between two vectors; values that fall
  *               slightly outside `[-1, 1]` because of rounding are clamped
  * @return the score in `[-1, 1]`, or `NaN` if `cossim` is `NaN`
  */
def metric(cossim: Double): Double = {
  // acos returns NaN for any |x| > 1, even by one ulp, so clamp first.
  // (min/max propagate NaN, so a NaN input still yields NaN.)
  val cosine = math.max(-1.0, math.min(1.0, cossim))
  1 - 2 * math.acos(cosine) / math.Pi
}
// Raw input lines: skips the first line of "r9k-cossim.data" (presumably a header —
// TODO confirm) and keeps at most the next 500.
val threadsData: IndexedSeq[String] = {
  val source = io.Source.fromFile("r9k-cossim.data")
  // getLines is lazy, so the lines must be materialised before the source is closed.
  try source.getLines().drop(1).take(500).toIndexedSeq
  finally source.close()
}
// Parses every raw line of the form "<id>\t<text>" into (thread id, word frequencies).
val threads: IndexedSeq[(Int, WordFreq)] = threadsData.map { line =>
  // Limit 2: only the first tab separates id from text. A line without a tab
  // fails this pattern (MatchError); a non-numeric id fails in `toInt` below.
  val Array(id, txt) = line.split("\t", 2)
  // NOTE(review): the regex "\\[ntr\\]" matches the literal text "[ntr]". If the
  // intent was to blank out escaped "\n" / "\t" / "\r" sequences in the data,
  // the pattern would need to be "\\\\[ntr]" — confirm against the data file.
  val words = txt.toLowerCase
    .replaceAll("\\[ntr\\]", " ")
    .split("\\W+")
    .filter(w => w.length >= 3 && !w.matches("\\d+")) // drop short tokens and pure numbers
  // Strict `map` instead of `mapValues`: the latter is a lazy view (and a
  // `MapView` on Scala 2.13+, which does not conform to `WordFreq`).
  val frequency: WordFreq =
    words.groupBy(identity).map { case (word, occurrences) => word -> occurrences.length }
  id.toInt -> frequency
}.toIndexedSeq
// Scores every 2-element combination of `threads` as (first id, second id, score),
// where the score is `metric` applied to the pair's cosine similarity.
val similarities =
  threads.combinations(2).collect {
    case Seq((aId, aWords), (bId, bWords)) =>
      (aId, bId, metric(cossim(aWords, bWords)))
  }

// Print only the pairs whose score exceeds 0.85.
similarities.toSeq
  .filter { case (_, _, sim) => sim > 0.85 }
  .foreach(println)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment