Created
October 26, 2018 12:28
-
-
Save hvesalai/e0f0671acf94518c58fc8e267af89d86 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.onomatics.phonetic.Syllable | |
import com.onomatics.phonetic.Syllable.SyllableVector | |
import com.onomatics.phonetic.Phoneme.loadSymbolFeatures | |
/** Splits one syllable into its onset and coda core/affix segments.
  *
  * The syllable text is parsed with `SyllableVector.parse` using the
  * `featuresBySymbol` table, then the full onset and the full coda are each
  * passed through `Syllable.splitCoreAffix`.
  *
  * @param syllable textual form of a single syllable, as accepted by
  *                 `SyllableVector.parse`
  * @return a four-element list, in this fixed order:
  *         index 0 = onset affix (reversed), index 1 = onset core (reversed),
  *         index 2 = coda core, index 3 = coda affix
  */
def caRaw(syllable: String) = {
  val parsed = SyllableVector.parse(syllable, featuresBySymbol)

  // NOTE(review): the boolean flag presumably tells splitCoreAffix which
  // margin it is working on (false for the onset, true for the coda) —
  // confirm against Syllable.splitCoreAffix.
  val (coreOfOnset, affixOfOnset) = Syllable.splitCoreAffix(parsed.onsetFull, false)
  val (coreOfCoda, affixOfCoda) = Syllable.splitCoreAffix(parsed.codaFull, true)

  // Only the onset segments are reversed; the coda segments keep their order.
  List(
    affixOfOnset.reverse,
    coreOfOnset.reverse,
    coreOfCoda,
    affixOfCoda
  )
}
// Symbol feature table, loaded once; `caRaw` passes it to SyllableVector.parse.
val featuresBySymbol = loadSymbolFeatures("phonemic/en/symbol_features.tsv")

// Path of the pronunciation dictionary that is analysed below.
val cmuFile = "/opt/cmudict/cmudict-kh.rep"

// One entry per usable dictionary line: the syllables of that line's
// pronunciation. Lines starting with "#" are skipped, each remaining line is
// split at its first space, digits are stripped from the part after that
// space, and the result is split on "-" and trimmed.
// Lines that contain no space at all are silently ignored by the `collect`.
//
// The lines are fully read into a Vector and the file is closed before this
// value is published. Previously this was a lazy, single-use Iterator over a
// Source that was never closed: the file handle leaked, and the iterator was
// already exhausted once `syllables` had been computed.
val pronunciations = {
  val source = scala.io.Source.fromFile(cmuFile)
  try {
    source.getLines().
      filterNot(_.startsWith("#")).
      map(_.split(" ", 2)).
      collect {
        // The text before the first space is not needed here.
        case Array(_, phonetics) =>
          phonetics.replaceAll("[0-9]", "").split("-").map(_.trim)
      }.
      toVector
  } finally {
    source.close()
  }
}

// Every distinct syllable string that occurs in any pronunciation.
val syllables = pronunciations.flatten.toSet

// The four-part breakdown produced by `caRaw` for each distinct syllable.
val structure = syllables.map(caRaw)
/** Prints the distinct segments found at position `i` of every entry in
  * `structure`, grouped by a set of strings derived from each segment.
  *
  * The grouping key of a segment is built from the `toString` of each of its
  * elements: text matching `[VC]\([A-Z]*_[0-9]*, ` is removed, the last
  * character is dropped, and the remainder is split on ", ".
  * NOTE(review): this presumably peels a `V(SYMBOL_1, ...)` / `C(SYMBOL_1, ...)`
  * wrapper off a feature list — confirm against the element's toString.
  *
  * One line is printed per group: the key strings joined by spaces,
  * right-aligned in an 80-character column, then a tab, then the members of
  * the group (each rendered as its elements' `text` joined by spaces)
  * separated by ";".
  *
  * @param i index into each entry of `structure` (as built by `caRaw`,
  *          0 to 3); an index outside the entry is not guarded against
  */
def printGrouped(i: Int) = {
  // Distinct segments found at position i.
  val segments = structure.map(parts => parts(i))

  val byFeatures = segments.groupBy { segment =>
    segment.flatMap { element =>
      val described = element.toString.replaceAll("[VC]\\([A-Z]*_[0-9]*, ", "")
      described.dropRight(1).split(", ").toSet
    }.toSet
  }

  // Built as a Map on purpose: groups whose keys render to the same label
  // collapse into one entry, exactly as in a direct map over the grouping.
  val rows = byFeatures.map { case (features, members) =>
    val label = features.mkString(" ")
    val listing = members.map(segment => segment.map(_.text).mkString(" ")).mkString(";")
    label -> listing
  }

  rows.foreach { case (label, listing) =>
    println(f"$label%80s\t$listing")
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment