Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Getting triphones from CMUDict pronunciations in NLTK
import re
from collections import defaultdict
import nltk
from nltk.corpus import cmudict
def clean_pron(pron):
    """Return ``pron`` with every digit character removed.

    The digits are the stress markers attached to a phoneme string
    (e.g. ``"AH0"`` becomes ``"AH"``); everything else is left as is.
    """
    stress_digits = re.compile(r"\d")
    return stress_digits.sub("", pron)
def make_triphones(pron):
    """Build ``((previous two phonemes), phoneme)`` pairs for one word.

    ``pron`` is a sequence of phonemes.  Sequences shorter than three
    phonemes yield an empty list.  Otherwise the result holds, in order:
    every interior triphone, then one word-start entry whose context is
    ``('#', '#')``, then one word-end entry whose outcome is ``'#'``.
    """
    if len(pron) < 3:
        return []
    # Slide a window of width three across the pronunciation.
    interior = [((first, second), third)
                for first, second, third in zip(pron, pron[1:], pron[2:])]
    # '#' stands in for the word boundary on either side of the word.
    word_start = (('#', '#'), pron[0])
    word_end = ((pron[-2], pron[-1]), '#')
    return interior + [word_start, word_end]
def triphone_probs(prons):
    """Estimate P(phoneme | two-phoneme context) from pronunciations.

    ``prons`` is an iterable of pronunciations; each one is passed to
    ``make_triphones``.  Returns a nested ``defaultdict`` mapping
    ``context -> phoneme -> relative frequency`` where the frequencies
    for each context sum to 1.
    """
    table = defaultdict(lambda: defaultdict(int))
    # First pass: raw counts of each phoneme following each context.
    for pron in prons:
        for triphone in make_triphones(pron):
            history, phone = triphone
            table[history][phone] += 1
    # Second pass: turn the counts into relative frequencies in place.
    # Only existing keys are reassigned, so the dicts never change size
    # while they are being iterated.
    for following in table.values():
        denominator = sum(following.values())
        for phone in following:
            following[phone] = float(following[phone]) / denominator
    return table
def main():
    """Print the outcome distribution for the context ``('IH', 'NG')``.

    Loads the CMU pronouncing dictionary through NLTK, strips the stress
    digits from every phoneme, estimates triphone probabilities over all
    pronunciations, and prints the context followed by each outcome and
    its probability, most probable first.
    """
    pron_dict = cmudict.dict()
    # Each dictionary value is iterated as a collection of pronunciations,
    # and each pronunciation as a sequence of phoneme strings.  Build real
    # lists (not lazy ``map`` objects) because make_triphones needs len()
    # and indexing; on Python 3 ``map`` returns an iterator.
    prons = ([clean_pron(phone) for phone in pron]
             for word_prons in pron_dict.values()
             for pron in word_prons)
    triphones = triphone_probs(prons)
    context = ('IH', 'NG')
    outcomes = triphones[context]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(context)
    sorted_outcomes = sorted(outcomes.items(), key=lambda x: x[1], reverse=True)
    for outcome in sorted_outcomes:
        print("%s: %.4f" % outcome)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment