Skip to content

Instantly share code, notes, and snippets.

@kumarde
Last active May 22, 2019 15:54
Show Gist options
  • Save kumarde/96964d265706ca5d889a0dba55bcfad4 to your computer and use it in GitHub Desktop.
Save kumarde/96964d265706ca5d889a0dba55bcfad4 to your computer and use it in GitHub Desktop.
"""This file handles the interface with the CMU word -> phoneme service."""
import requests
import json
import defaults as d
import os
# Cookie values sent with every request to the CMU lextool service.
# NOTE(review): these look like Google Analytics cookies copied from a browser
# session -- confirm whether the service actually needs them.
COOKIES = {
    '__utma': '44984886.2147423455.1500652902.1500652902.1500652902.1',
    '__utmz': '44984886.1500652902.1.1.utmccn=(referral)|utmcsr=google.com|utmcct=/|utmcmd=referral',
    '_ga': 'GA1.2.858928425.1500652902',
    '_gid': 'GA1.2.1514959829.1509572636',
    '__utmt': '1',
    '__utmb': '100617052.2.10.1509725212',
    '__utmc': '100617052',
}

# Browser-like HTTP headers sent with every request to the CMU lextool service.
HEADERS = {
    'Origin': 'http://www.speech.cs.cmu.edu',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.75 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': 'http://www.speech.cs.cmu.edu/tools/lextool.html',
    'Connection': 'keep-alive',
}

# Path of the JSON file holding the results of previous queries to the CMU
# service, so words that were already looked up are not queried again.
CACHE = d.CMU_CACHE_FILE
def get_updated_phonemes(results_array):
    """Return the word -> phonemes mapping after looking up the words in *results_array*.

    For every row, the ``d.UNDERSTOOD`` and ``d.EXPECTED`` entries are written
    (one per line) to a scratch file, which is then handed to
    ``get_phonemes`` together with the cached mapping. The updated mapping is
    written back to the cache and returned.

    Args:
        results_array: iterable of rows indexable by ``d.UNDERSTOOD`` and
            ``d.EXPECTED``; both entries must be strings.

    Returns:
        The mapping returned by ``get_phonemes`` (cached entries plus any
        newly fetched ones).
    """
    scratch = 'tosend.txt'

    # read_cache() returns None when the cache file cannot be opened (e.g. on
    # the very first run); start from an empty mapping instead of crashing on
    # the membership tests inside get_phonemes().
    known_phonemes = read_cache()
    if known_phonemes is None:
        known_phonemes = {}

    try:
        with open(scratch, 'w') as cmu_file:
            for row in results_array:
                cmu_file.write(row[d.UNDERSTOOD] + '\n')
                cmu_file.write(row[d.EXPECTED] + '\n')
        known_phonemes = get_phonemes(scratch, known_phonemes)
        write_cache(known_phonemes)
    finally:
        # Always clean up the scratch file, even when a query fails midway.
        if os.path.exists(scratch):
            os.remove(scratch)
    return known_phonemes
def read_cache():
    """Load the cached mapping from the JSON file at ``CACHE``.

    Returns:
        The decoded JSON content of the cache file, or ``None`` (after
        printing a message) when the file cannot be opened.

    Raises:
        ValueError: if the file content is not valid JSON (propagated from
            ``json.load``).
    """
    try:
        f = open(CACHE, 'r')
    except IOError as e:
        print(e, '\nError opening CMU cache file for reading: ' + CACHE)
        return None
    # The context manager closes the handle even when decoding fails.
    with f:
        return json.load(f)
def write_cache(known):
    """Write *known* as JSON to the file at ``CACHE``.

    Args:
        known: JSON-serializable object to store.

    Returns:
        ``True`` when the cache was written, ``False`` (after printing a
        message) when the file could not be opened for writing.

    Raises:
        TypeError: if *known* is not JSON-serializable.
    """
    # Serialize first: opening with 'w' truncates the file, so a failure
    # during serialization must not be allowed to wipe the existing cache.
    payload = json.dumps(known)
    try:
        f = open(CACHE, 'w')
    except IOError:
        print('Error opening CMU cache file for writing: ' + CACHE)
        return False
    with f:
        f.write(payload)
    return True
def write_other_cache(known, path='common_cache.json'):
    """Write *known* as JSON to a secondary cache file.

    Args:
        known: JSON-serializable object to store.
        path: destination file; defaults to ``common_cache.json`` in the
            current working directory.

    Returns:
        ``True`` when the file was written, ``False`` (after printing a
        message) when it could not be opened for writing.

    Raises:
        TypeError: if *known* is not JSON-serializable.
    """
    # Serialize first: opening with 'w' truncates the file, so a failure
    # during serialization must not be allowed to wipe the existing content.
    payload = json.dumps(known)
    try:
        f = open(path, 'w')
    except IOError:
        print('Error opening CMU cache file for writing: ' + path)
        return False
    with f:
        f.write(payload)
    return True
def _is_ascii(text):
    """Return True when *text* contains only ASCII characters."""
    try:
        text.encode('ascii')
    except UnicodeError:
        return False
    return True


def get_phonemes(fname, known):
    """Look up every uncached word listed in *fname* and add it to *known*.

    The file is read line by line (one word or phrase per line). A line is
    skipped when it is empty, counted as a hit when its upper-cased form is
    already in *known*, was already queued during this call, or
    ``d.exclude`` rejects it, and counted as ignored when it contains
    parentheses or non-ASCII characters. Everything else is a miss and is
    sent to the CMU service in batches of ``d.CMU_QUERY_SIZE`` lines.

    Args:
        fname: path of the text file listing the words.
        known: mapping of already-known words; ``None`` is treated as empty.

    Returns:
        The mapping returned by ``parse_response`` after the last batch, or
        *known* itself when nothing had to be queried.
    """
    if known is None:
        known = {}

    hits = 0
    misses = 0
    ignored = 0
    queries = 0
    batch = []      # words waiting for the next query, in file order
    queued = set()  # upper-cased words already queued during this call

    with open(fname, 'r') as word_file:
        for line in word_file:
            word = line.strip('\n')
            key = word.upper()
            if key == '':
                continue

            # Already cached, already queued in this run, or excluded by the
            # project rules: nothing to ask the service.
            if key in known or key in queued or d.exclude(word):
                hits += 1
                continue

            # Parenthesised or non-ASCII entries are never sent.
            if '(' in word or ')' in word or not _is_ascii(word):
                ignored += 1
                continue

            print('Cache miss: ' + word)
            batch.append(word)
            queued.add(key)
            misses += 1

            # Flush as soon as the batch is full; the batch length (not the
            # running miss total) decides, so the counters stay accurate.
            if len(batch) >= d.CMU_QUERY_SIZE:
                resp = send_query(''.join(w + '\n' for w in batch))
                known = parse_response(resp, known)
                queries += 1
                batch = []

    # One more query to finish off any stragglers
    if batch:
        resp = send_query(''.join(w + '\n' for w in batch))
        known = parse_response(resp, known)
        queries += 1

    print('Submitted %d queries to the CMU service (%d hits, %d misses, %d ignored)' % (queries, hits, misses, ignored))
    return known
def parse_response(resp, known):
    """Merge the pronunciations listed in *resp* into *known*.

    ``resp.text`` is read as lines of ``WORD<TAB>PHONEMES``, with the
    phonemes separated by whitespace. A word ending in ``)`` (for example
    ``WORD(2)``) is stored under the part before the first ``(``, so all
    variants of a word end up in the same list. Lines without a tab or with
    an empty word are skipped.

    Args:
        resp: object with a ``text`` attribute holding the response body.
        known: dict mapping a word to a list of pronunciations, each
            pronunciation being a list of phoneme strings. Updated in place.

    Returns:
        The same *known* dict, for convenience.
    """
    for line in resp.text.split('\n'):
        fields = line.split('\t')
        if len(fields) < 2:
            continue

        word = fields[0]
        if word.endswith(')'):
            # Alternate pronunciation marker: "WORD(2)" -> "WORD".
            word = word.split('(')[0]
        if not word:
            continue

        # encode/decode round trip keeps the phonemes as text while replacing
        # any non-ASCII character with '?'. Calling str() on the encoded
        # bytes would store "b'AH'" under Python 3.
        phonemes = [p.encode('ascii', 'replace').decode('ascii') for p in fields[1].split()]

        pronunciations = known.setdefault(word, [])
        # Re-parsing the same answer must not duplicate a pronunciation.
        if phonemes not in pronunciations:
            pronunciations.append(phonemes)
    return known
def send_query(to_send, timeout=30):
    """Submit a word list to the CMU lextool service and fetch the result.

    The words are uploaded as the ``wordfile`` form file. The returned page
    is searched for the text ``DICT`` and the link that follows it (up to
    the next ``-->``) is downloaded.

    Args:
        to_send: newline-separated words to look up.
        timeout: seconds to wait for each of the two HTTP requests.

    Returns:
        The ``requests`` response of the download of the extracted link.

    Raises:
        ValueError: if the returned page contains no ``DICT ... -->`` link.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    file_stuff = {
        'wordfile': to_send,
        'handfile': ''
    }
    # Reuse the module-level HEADERS / COOKIES rather than a private copy.
    resp = requests.post('http://www.speech.cs.cmu.edu/cgi-bin/tools/logios/lextool.pl', headers=HEADERS,
                         cookies=COOKIES, files=file_stuff, timeout=timeout)

    page = resp.text
    marker = page.find("DICT")
    end = page.find('-->', marker + 5) if marker != -1 else -1
    if marker == -1 or end == -1:
        # Without this check find() == -1 would silently produce a garbage URL.
        raise ValueError('CMU lextool response did not contain a DICT link')

    # Skip "DICT" plus the following separator character, and drop the two
    # characters in front of the closing "-->".
    dict_link = page[marker + 5:end - 2].strip()
    return requests.get(dict_link, timeout=timeout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment