Skip to content

Instantly share code, notes, and snippets.

@pathikrit
Created March 2, 2017 15:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pathikrit/144778a5d321b95c9f2f7a18d38eac83 to your computer and use it in GitHub Desktop.
Save pathikrit/144778a5d321b95c9f2f7a18d38eac83 to your computer and use it in GitHub Desktop.
Thread-safe k-shingling in Scala
import scala.collection.mutable
/**
* A thread-safe Scala port of the KShingling implementation of https://github.com/tdebatty/java-string-similarity
*/
class KShingling(k: Int = 3) {self =>
private[this] val shingles = mutable.Map.empty[String, Int]
def profile(s: String) = {
val freq = mutable.Map.empty[Int, Int] withDefaultValue 0
(0 to (s.length - k)) foreach {i =>
val t = s.substring(i, i + k)
val pos = shingles.synchronized(shingles.getOrElseUpdate(t, shingles.size))
freq(pos) += 1
}
new SparseVector(freq)
}
class SparseVector(freq: mutable.Map[Int, Int]) {
val size = freq.size
private[SparseVector] val keys, values = Array.ofDim[Int](size)
private[this] var c = 0
private[SparseVector] var norm = 0.0
freq.toSeq.sorted foreach {case (k, v) =>
keys(c) = k
values(c) = v
norm += 1.0 * v * v
c += 1
}
norm = Math.sqrt(norm)
def cosineSimilarity(that: self.SparseVector) = {
var i, j = 0
var agg = 0.0
while(i < this.size && j < that.size) {
val k1 = this.keys(i)
val k2 = that.keys(j)
if (k1 < k2) {
i += 1
} else if (k1 > k2) {
j += 1
} else {
agg += 1.0 * this.values(i) * that.values(j)
i += 1
j += 1
}
}
agg/(this.norm * that.norm)
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment