Skip to content

Instantly share code, notes, and snippets.

@srubin
Last active October 19, 2018 16:29
Show Gist options
  • Save srubin/5139432 to your computer and use it in GitHub Desktop.
Save srubin/5139432 to your computer and use it in GitHub Desktop.
Get arpabet pronunciation for a list of words (will also guess pronunciation if it's not in the CMU Sphinx dictionary). This basically just calls a CMU Sphinx tool online (lmtool: http://www.speech.cs.cmu.edu/tools/lmtool-new.html) and parses the results.
"""
Get the arpabet pronunciation of a set of words, courtesy
of the CMU Sphinx pronunciation dictionary (and their
tools to determine the pronunciation of unknown words).
Usage: create a Pronounce object, add words to it with .add(),
then call .p() to get the pronunciations.
Command line: python pronunciation.py list of words to pronounce
Copyright 2013 - Steven Rubin - srubin@cs.berkeley.edu
MIT License
"""
import requests
import sys
import re
import string
class Pronounce(object):
    """Look up ARPAbet pronunciations through the CMU Sphinx lmtool web service.

    Words are collected on the instance (via the constructor and ``add``) and
    resolved in one batch by ``p``, which uploads them as a corpus, locates the
    generated ``<digits>.dic`` file in the response and parses it.
    """

    url = "http://www.speech.cs.cmu.edu/cgi-bin/tools/lmtool/run"
    # Seconds to wait on each HTTP request before giving up; override on the
    # class or an instance if the service is slow.
    timeout = 60
    # Name of the generated dictionary file as it appears in the result page.
    dict_re = re.compile(r"\d+\.dic")
    # Alternate pronunciations are listed as "WORD(2)", "WORD(3)", ...
    other_pr = re.compile(r"(.*)\(\d+\)$")
    vowel_re = re.compile(r"AA|AE|AH|AO|AW|AY|EH|ER|EY|IH|IY|OW|OY|UH|UW")

    # Text type that supports ``translate`` with an ordinal -> None mapping on
    # both Python 2 (``unicode``) and Python 3 (``str``).
    try:
        _text_type = unicode  # noqa: F821 - only defined on Python 2
    except NameError:
        _text_type = str
    # Translation table that deletes every ASCII punctuation character.
    _punc_map = {ord(c): None for c in string.punctuation}

    def __init__(self, words=None):
        """Start with an optional iterable of words to pronounce.

        The iterable is copied, so later ``add`` calls never mutate the
        caller's list and any iterable (e.g. a tuple) is accepted.
        """
        self.words = list(words) if words else []

    def add(self, word):
        """Queue one more word for the next call to ``p``."""
        self.words.append(word)

    def p(self, add_fake_stress=False):
        """Fetch pronunciations for all queued words.

        Args:
            add_fake_stress: when true, append ``0`` to every vowel phoneme
                of each pronunciation.

        Returns:
            A dict mapping each original word to a list whose first item is
            the upper-cased word and whose remaining items are its
            pronunciations (primary first, then alternates). Empty when
            there is nothing to look up.

        Raises:
            requests.exceptions.RequestException: on a network failure,
                timeout or HTTP error status.
            RuntimeError: if the service response contains no link to a
                ``.dic`` dictionary file.
        """
        w_upper = [self._text_type(w).upper() for w in self.words]
        w_nopunc = [w.translate(self._punc_map) for w in w_upper]
        if not any(w_nopunc):
            # Nothing pronounceable was queued; skip the network round-trip.
            return {}

        files = {'corpus': ('words.txt', " ".join(w_nopunc))}
        res = requests.post(self.url,
                            data={"formtype": "simple"},
                            files=files, allow_redirects=True,
                            timeout=self.timeout)
        res.raise_for_status()

        found = self.dict_re.search(res.text)
        if found is None:
            raise RuntimeError(
                "No .dic file found in the response from %s" % res.url)

        # The dictionary file name is appended to the URL the POST ended up
        # at after redirects.
        dict_res = requests.get(res.url + found.group(0),
                                timeout=self.timeout)
        dict_res.raise_for_status()
        return self._parse_dictionary(dict_res.text, w_upper, w_nopunc,
                                      add_fake_stress)

    def _parse_dictionary(self, dict_text, w_upper, w_nopunc,
                          add_fake_stress=False):
        """Turn tab-separated dictionary text into the result dict of ``p``.

        ``w_upper`` and ``w_nopunc`` are the upper-cased and
        punctuation-stripped forms of ``self.words``, index-aligned with it.
        Lines without a tab, and entries that match none of the submitted
        words, are skipped.
        """
        # First occurrence wins, matching list.index() but in O(1) per line.
        first_index = {}
        for i, word in enumerate(w_nopunc):
            first_index.setdefault(word, i)

        pronunciations = {}
        for line in dict_text.splitlines():
            fields = line.split('\t')
            if len(fields) < 2:
                continue
            entry, phones = fields[0], fields[1]

            # Fold "WORD(2)" style alternates back onto "WORD".
            alternate = self.other_pr.match(entry)
            if alternate:
                entry = alternate.group(1)

            idx = first_index.get(entry)
            if idx is None:
                continue

            if add_fake_stress:
                phones = self.vowel_re.sub(r"\g<0>0", phones)

            orig = self.words[idx]
            if orig in pronunciations:
                pronunciations[orig].append(phones)
            else:
                pronunciations[orig] = [w_upper[idx], phones]
        return pronunciations
if __name__ == '__main__':
    # Command-line use: every argument after the script name is a word.
    pr = Pronounce(sys.argv[1:])
    # Parenthesised print outputs the same dict on Python 2 and is valid
    # syntax on Python 3, where the bare print statement is not.
    print(pr.p())
@Ranjan13
Copy link

Hey!
I have been trying to use it, but it seems the URL isn't working anymore. Any idea regarding alternate URLs?

@schwade
Copy link

schwade commented Oct 19, 2018

Thank you for this! It works perfectly. I also really appreciate the add_fake_stress function. Saved me some extra work!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment