Skip to content

Instantly share code, notes, and snippets.

@Reldan
Created March 5, 2014 12:36
package com.digsolab.shishishi
import java.io._
import scala.collection.concurrent.TrieMap
import scala.collection.immutable
/**
 * Process-wide, in-memory dictionary of word forms.
 *
 * Input is consumed as blocks of lines separated by empty lines. Inside a block the first
 * line is read and discarded, the second line supplies the normal form and every further
 * line supplies one more form of that word.
 * NOTE(review): the discarded first line is presumably the lemma id of the OpenCorpora
 * plain-text dump — confirm against the actual file.
 */
object Dictionary {

  type NormalizedWord = String

  /**
   * Normalized normal form -> the other (normalized) forms listed in the same block.
   * Only the keys of the inner maps carry information; every value is `null`.
   * NOTE(review): some `TrieMap` versions report a key whose value is `null` as absent from
   * `get`/`contains`; iterate the inner maps (`keySet`, `mkString`) instead — verify.
   */
  val dictionary: TrieMap[String, TrieMap[String, Any]] =
    TrieMap.empty[String, TrieMap[String, Any]]

  /** Normalized inflected form -> normalized normal form of the block that listed it. */
  val subForms: TrieMap[String, String] = TrieMap.empty[String, String]

  /**
   * Parses a plain-text dictionary from `stream` and adds its content to [[dictionary]] and
   * [[subForms]]. Entries loaded earlier are kept; a normal form or inflected form that is
   * seen again replaces the mapping stored for it.
   *
   * Blocks that contain nothing but the discarded first line, as well as surplus empty
   * lines, are skipped instead of aborting the load.
   *
   * The stream is read to its end but not closed; closing it stays with the caller.
   *
   * @param stream   source of the dictionary text
   * @param encoding charset name used to decode `stream`. Defaults to UTF-8 so the result
   *                 does not depend on the platform default charset.
   *                 NOTE(review): UTF-8 is assumed to match the dictionary dump — confirm.
   */
  def loadPlainFromStream(stream: InputStream, encoding: String = "UTF-8"): Unit = {
    val lines = scala.io.Source.fromInputStream(stream, encoding).getLines()
    while (lines.hasNext) {
      readBlock(lines) match {
        case _ :: lemmaLine :: formLines =>
          val normalForm = normalizeWord(splitLine(lemmaLine)._1)
          val forms = TrieMap.empty[String, Any]
          formLines.foreach { formLine =>
            val form = normalizeWord(splitLine(formLine)._1)
            forms.update(form, null)
            subForms.update(form, normalForm)
          }
          dictionary.update(normalForm, forms)
        case _ =>
          () // empty block, or a block holding only the discarded first line
      }
    }
  }

  /** Number of normal forms currently stored. */
  def size: Int = dictionary.size

  /** Looks `word` up as a normal form; the word is normalized before the lookup. */
  def get(word: String): Option[TrieMap[String, Any]] =
    dictionary.get(normalizeWord(word))

  /**
   * Canonical spelling used for every key: upper case with 'Ё' replaced by 'Е'.
   * The root locale keeps the upper-casing independent of the JVM default locale.
   */
  def normalizeWord(word: String): String =
    word.toUpperCase(java.util.Locale.ROOT).replace('Ё', 'Е')

  /**
   * Pulls lines from `lines` up to and including the next empty line (or the end of input)
   * and returns the non-empty ones in order. Only `hasNext`/`next` are used, so the same
   * iterator can safely be handed in again for the following block.
   */
  private def readBlock(lines: Iterator[String]): List[String] = {
    val block = List.newBuilder[String]
    var open = true
    while (open && lines.hasNext) {
      val line = lines.next()
      if (line.isEmpty) open = false else block += line
    }
    block.result()
  }

  /**
   * Splits a line on tab into `(form, grammeme)`. A line that does not split into exactly
   * two fields yields `("", "")`, i.e. it contributes the empty string as its form.
   */
  private def splitLine(line: String): (String, String) =
    line.split("\t") match {
      case Array(form, grammeme) => (form, grammeme)
      case _ => ("", "")
    }
}
/**
 * Command-line demo: loads the dictionary file into [[Dictionary]], prints how many normal
 * forms it holds, then looks up a fixed list of sample words. After every lookup it requests
 * a garbage collection and prints free and used heap figures.
 */
object Shishishi extends App {
  // Despite the name, this file is opened as plain text; nothing here decompresses it.
  val bzipDictName = "/Users/reldan/Copy/myprogs/dict.opcorpora.txt"

  val file = new BufferedInputStream(new FileInputStream(bzipDictName))
  // The loader reads the stream to its end; release the file handle even if loading fails.
  try Dictionary.loadPlainFromStream(file)
  finally file.close()
  println(s"Dictionary size: ${Dictionary.size} nodes")

  val myargs = List("стали", "стать", "ставших", "абвгд")
  myargs foreach { word =>
    val normalized = Dictionary.normalizeWord(word)
    println(s"Search the word ${word}")
    Dictionary.get(normalized) match {
      case Some(nodes) =>
        // The word is itself a normal form.
        println(s"Found the word: $word, isNormalForm=true")
        println("All forms:")
        println(nodes)
        println(nodes.mkString(" , "))
      case None =>
        // Not a normal form: try to resolve it as an inflected form.
        Dictionary.subForms.get(normalized) match {
          case Some(normalForm) =>
            println(s"Found the word: $normalForm, isNormalForm=true")
            println("All forms:")
            // Prints an empty line instead of throwing if the normal form has no entry.
            println(Dictionary.get(normalForm).map(_.keySet.mkString(" , ")).getOrElse(""))
          case None =>
            println("Could not find the word.")
        }
    }

    // Heap report, printed once per looked-up word.
    System.gc()
    val runtime = Runtime.getRuntime()
    println(s"Free memory (bytes): ${runtime.freeMemory()}")
    val mb = 1024 * 1024
    println("##### Heap utilization statistics [MB] #####")
    println(s"Used Memory:${(runtime.totalMemory() - runtime.freeMemory()) / mb}")
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment