Skip to content

Instantly share code, notes, and snippets.

@ConstantineLignos
Created September 15, 2011 16:42
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ConstantineLignos/1219749 to your computer and use it in GitHub Desktop.
Save ConstantineLignos/1219749 to your computer and use it in GitHub Desktop.
Getting triphones from CMUDict pronunciations in NLTK
import re
from collections import defaultdict
import nltk
from nltk.corpus import cmudict
def clean_pron(pron):
    """Strip stress markers from a pronunciation string.

    Stress is encoded as digit characters, so every digit found anywhere
    in ``pron`` is deleted and the remaining text is returned unchanged.
    """
    stress_digits = re.compile(r"\d")
    return stress_digits.sub("", pron)
def make_triphones(pron):
    """Output triphones from a word's pronunciation.

    Each triphone is a ``((previous_two, previous_one), phoneme)`` pair,
    i.e. a two-phoneme context followed by the phoneme it precedes.

    ``pron`` may be any iterable of phonemes (list, tuple, generator,
    ``map`` object, ...); it is materialized once so that it can be
    measured and indexed.

    Returns a list containing, in this order:

    * one triphone for every phoneme from the third onward,
    * a word-start triphone ``(('#', '#'), first_phoneme)``,
    * a word-end triphone ``((second_to_last, last), '#')``.

    Pronunciations with fewer than three phonemes yield an empty list,
    so they contribute no word-boundary triphones either. No triphone
    with the context ``('#', first_phoneme)`` is emitted.
    """
    # Copy into a list: len() and indexing below fail on lazy iterables.
    phones = list(pron)
    if len(phones) < 3:
        return []

    # Slide a window of three across the word.
    interior = [((first, second), third)
                for first, second, third
                in zip(phones, phones[1:], phones[2:])]
    # '#' stands in for the word boundary on either side of the word.
    word_start = [(('#', '#'), phones[0])]
    word_end = [((phones[-2], phones[-1]), '#')]
    return interior + word_start + word_end
def triphone_probs(prons):
    """Estimate how likely each phoneme is to follow a two-phoneme context.

    Every pronunciation in ``prons`` is split into triphones with
    ``make_triphones``; the occurrences of each (context, phoneme) pair
    are tallied and then divided by the total number of occurrences of
    that context.

    Returns a nested ``defaultdict`` mapping context -> phoneme ->
    relative frequency (a float).
    """
    tallies = defaultdict(lambda: defaultdict(int))
    for pron in prons:
        for history, following in make_triphones(pron):
            tallies[history][following] += 1

    # Turn the raw counts into relative frequencies, reusing the same
    # inner dictionaries.
    for distribution in tallies.values():
        denominator = sum(distribution.values())
        for phoneme in list(distribution):
            distribution[phoneme] = float(distribution[phoneme]) / denominator
    return tallies
def main():
    """Compute some triphone probabilities.

    Loads the CMUdict pronunciations through NLTK, strips the stress
    digits from every phoneme, estimates triphone probabilities over all
    pronunciations, and prints the outcomes that follow the context
    ('IH', 'NG'), most probable first.
    """
    pron_dict = cmudict.dict()
    # Each dictionary value holds all pronunciations of one word. Build
    # every pronunciation as a real list rather than with map(): on
    # Python 3 map() is lazy and has no len(), which make_triphones needs.
    prons = ([clean_pron(phoneme) for phoneme in pron]
             for word_prons in pron_dict.values()
             for pron in word_prons)
    triphones = triphone_probs(prons)

    context = ('IH', 'NG')
    outcomes = triphones[context]
    # Single-argument parenthesized print produces the same output under
    # both the Python 2 print statement and the Python 3 print function.
    print(context)
    sorted_outcomes = sorted(outcomes.items(), key=lambda x: x[1], reverse=True)
    for outcome in sorted_outcomes:
        print("%s: %.4f" % outcome)
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment