Skip to content

Instantly share code, notes, and snippets.

@thomasjungblut
Last active December 16, 2015 06:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomasjungblut/5390600 to your computer and use it in GitHub Desktop.
Save thomasjungblut/5390600 to your computer and use it in GitHub Desktop.
Inverted Index in less than 50 lines of code (and I was verbose!)
package de.jungblut.index
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.MultiMap
import scala.collection.mutable.HashMap
import scala.collection.mutable.Set
final class Index[T](nGramSize: Int) {
private val index = new HashMap[String, Set[T]] with MultiMap[String, T]
def build(content: Map[String, T]) = {
for (docTerm <- content) yield tokenize(docTerm._1).foreach(index.addBinding(_, docTerm._2))
this // return this so people can chain calls to construction and build
}
private def tokenize(sx: String, index: Int = 0): ArrayBuffer[String] = {
if (index >= sx.length() - 1) {
ArrayBuffer[String]() // at the end of the string, get our buffer
} else {
// recurse further into the string
val tokens = tokenize(sx, index + 1)
// if we are far away from the end, add to the tokens
if (index + nGramSize <= sx.length())
return (tokens :+ sx.substring(index, index + nGramSize))
tokens
}
}
def query(query: String, minScore: Double = 0.0) = {
val tokens = tokenize(query)
val documentSets = tokens.flatMap(tokenize(_)).distinct.flatMap(index.get(_))
val counts = documentSets.flatMap(_.toSet).groupBy(token => token).mapValues(_.size)
// now calculate the match score by dividing the counts by the tokens
counts.map(Function.tupled((k, v) => (k, v / tokens.size.doubleValue()))).filter(_._2 > minScore).toList.sortBy(-_._2)
}
}
object Main {
def main(args: Array[String]): Unit = {
val index = new Index(3).build(Map("I'm cool" -> "cool doc", "rofl!!! that was funny" -> "rofl doc", "omg that was funny!11 lol" -> "what a funny doc!"))
println(index.query("funny I'm"))
println(index.query("that was so funny lol"),0.5)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment