Skip to content

Instantly share code, notes, and snippets.

@hvesalai
Created October 26, 2018 12:28
Show Gist options
  • Save hvesalai/e0f0671acf94518c58fc8e267af89d86 to your computer and use it in GitHub Desktop.
Save hvesalai/e0f0671acf94518c58fc8e267af89d86 to your computer and use it in GitHub Desktop.
import com.onomatics.phonetic.Syllable
import com.onomatics.phonetic.Syllable.SyllableVector
import com.onomatics.phonetic.Phoneme.loadSymbolFeatures
/** Breaks one syllable string into its four core/affix pieces.
  *
  * The syllable is parsed with `SyllableVector.parse` against `featuresBySymbol`,
  * then its full onset and full coda are each split by `Syllable.splitCoreAffix`.
  *
  * NOTE(review): the boolean passed to `splitCoreAffix` is `false` for the onset
  * and `true` for the coda; its exact meaning is defined in `Syllable` — confirm there.
  *
  * @param syllable textual form of a single syllable, as accepted by `SyllableVector.parse`
  * @return a four-element list, in this order: onset affix (reversed),
  *         onset core (reversed), coda core, coda affix
  */
def caRaw(syllable: String) = {
  val parsed = SyllableVector.parse(syllable, featuresBySymbol)

  val (coreOfOnset, affixOfOnset) = Syllable.splitCoreAffix(parsed.onsetFull, false)
  val (coreOfCoda, affixOfCoda) = Syllable.splitCoreAffix(parsed.codaFull, true)

  // Both onset pieces are reversed; the coda pieces are kept as returned.
  val reversedOnsetAffix = affixOfOnset.reverse
  val reversedOnsetCore = coreOfOnset.reverse

  List(reversedOnsetAffix, reversedOnsetCore, coreOfCoda, affixOfCoda)
}
// Symbol feature table, loaded once; `caRaw` passes it to `SyllableVector.parse`.
val featuresBySymbol = loadSymbolFeatures("phonemic/en/symbol_features.tsv")

// Path of the pronunciation file that is read below.
val cmuFile = "/opt/cmudict/cmudict-kh.rep"

// One entry per usable line of `cmuFile`: the line's phonetic part split into
// its "-"-separated pieces, with all digits removed and each piece trimmed.
// Lines starting with "#" are skipped, as are lines that do not split into
// exactly two parts at the first space.
// The result is fully materialised so that the underlying file can be closed
// (the original left the `Source` open) and so that it can be traversed more
// than once.
val pronunciations = {
  val source = scala.io.Source.fromFile(cmuFile)
  try {
    source.getLines()
      .filterNot(_.startsWith("#"))
      .map(_.split(" ", 2))
      .collect {
        // The first field is not needed here; only the phonetic part is kept.
        case Array(_, phonetics) =>
          phonetics.replaceAll("[0-9]", "").split("-").map(_.trim)
      }
      .toVector // force reading before the source is closed in `finally`
  } finally {
    source.close()
  }
}

// Distinct syllable strings across all pronunciations. Flatten first: arrays
// compare by reference, so deduplicating the arrays themselves achieves nothing.
val syllables = pronunciations.flatten.toSet

// `caRaw` applied to every distinct syllable.
val structure = syllables map caRaw
/** Prints the `i`-th piece of every entry in `structure`, grouped by feature set.
  *
  * For each entry of `structure` the element at index `i` is taken. These pieces
  * are grouped by the set of feature strings obtained from the `toString` of
  * their members, and one line is printed per group: the features joined by
  * spaces (right-aligned in an 80-character column), a tab, then the `text` of
  * each piece's members joined by spaces, with pieces separated by ";".
  *
  * NOTE(review): the regex assumes each member's `toString` looks like
  * `V(SYMBOL_1, feat, feat)` or `C(...)`, with `dropRight(1)` removing the
  * closing parenthesis — confirm against the phoneme type's `toString`.
  *
  * @param i index of the piece to inspect within each entry of `structure`
  */
def printGrouped(i: Int) = {
  // The i-th piece of every structure entry.
  val pieces = structure.map(entry => entry(i))

  // Group pieces by the union of the feature strings of their members.
  val byFeatureSet = pieces.groupBy { piece =>
    piece.flatMap { member =>
      member.toString
        .replaceAll("[VC]\\([A-Z]*_[0-9]*, ", "")
        .dropRight(1)
        .split(", ")
        .toSet
    }.toSet
  }

  // Render each group as (feature list, member texts); this stays a Map, as in the original chain.
  val rendered = byFeatureSet.map { case (features, members) =>
    val memberTexts = members.map(piece => piece.map(_.text).mkString(" "))
    features.mkString(" ") -> memberTexts.mkString(";")
  }

  rendered.foreach { case (k, v) => println(f"$k%80s\t$v") }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment