Last active
May 22, 2019 15:54
-
-
Save kumarde/96964d265706ca5d889a0dba55bcfad4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""This file handles the interface with the CMU word -> phoneme service.""" | |
import requests | |
import json | |
import defaults as d | |
import os | |
# Static cookie name -> value pairs for requests to the CMU lextool site.
# NOTE(review): the values look like they were copied from a recorded browser
# session (analytics-style cookie names) -- confirm whether they are required.
COOKIES = dict(
    __utma='44984886.2147423455.1500652902.1500652902.1500652902.1',
    __utmz='44984886.1500652902.1.1.utmccn=(referral)|utmcsr=google.com|utmcct=/|utmcmd=referral',
    _ga='GA1.2.858928425.1500652902',
    _gid='GA1.2.1514959829.1509572636',
    __utmt='1',
    __utmb='100617052.2.10.1509725212',
    __utmc='100617052',
)
# Static HTTP request headers for requests to the CMU lextool site.
# NOTE(review): presumably copied from a recorded browser request -- confirm
# which of these the service actually needs.
HEADERS = dict([
    ('Origin', 'http://www.speech.cs.cmu.edu'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-US,en;q=0.9'),
    ('Upgrade-Insecure-Requests', '1'),
    (
        'User-Agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.75 Safari/537.36',
    ),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
    ('Cache-Control', 'max-age=0'),
    ('Referer', 'http://www.speech.cs.cmu.edu/tools/lextool.html'),
    ('Connection', 'keep-alive'),
])
# Path of the on-disk cache that holds the results of previous queries to the
# CMU service, so that words which were already looked up are not queried
# again. read_cache() decodes this file as JSON and write_cache() overwrites it.
CACHE = d.CMU_CACHE_FILE
# This takes the results array and gives back a words -> phonemes mapping which
# is intended to cover all *valid* words in the results array.
def get_updated_phonemes(results_array):
    """Return the phoneme mapping after looking up every word in *results_array*.

    The ``d.UNDERSTOOD`` and ``d.EXPECTED`` entries of each row are written,
    one per line, to a scratch file which is handed to get_phonemes(). The
    resulting mapping is persisted with write_cache() and returned.

    Args:
        results_array: iterable of rows indexable by ``d.UNDERSTOOD`` and
            ``d.EXPECTED``; both entries must be strings.

    Returns:
        The mapping produced by get_phonemes() (cached entries plus any newly
        fetched ones).
    """
    scratch_path = 'tosend.txt'

    known_phonemes = read_cache()
    if known_phonemes is None:
        # read_cache() returns None when the cache file cannot be opened
        # (e.g. it does not exist yet). Start from an empty mapping instead
        # of failing on the membership test inside get_phonemes().
        known_phonemes = {}

    try:
        # The context manager closes the scratch file even if a row is
        # malformed and the write raises.
        with open(scratch_path, 'w') as cmu_file:
            for row in results_array:
                cmu_file.write(row[d.UNDERSTOOD] + '\n')
                cmu_file.write(row[d.EXPECTED] + '\n')
        known_phonemes = get_phonemes(scratch_path, known_phonemes)
        write_cache(known_phonemes)
    finally:
        # Never leave the scratch file behind, even when a query fails.
        if os.path.exists(scratch_path):
            os.remove(scratch_path)
    return known_phonemes
def read_cache():
    """Load the phoneme cache stored at ``CACHE``.

    Returns:
        The decoded JSON content of the cache file, or ``None`` when the file
        cannot be opened (an error message is printed in that case).

    Raises:
        ValueError: if the file content is not valid JSON.
    """
    try:
        f = open(CACHE, 'r')
    except IOError as e:
        print(e, '\nError opening CMU cache file for reading: ' + CACHE)
        return None
    # The context manager closes the handle even when decoding fails.
    with f:
        return json.load(f)
def write_cache(known):
    """Persist *known* as JSON to the file named by ``CACHE``.

    Args:
        known: JSON-serialisable object to store (the word -> phonemes cache).

    Returns:
        ``True`` once the file has been written, ``False`` when the file could
        not be opened for writing (an error message is printed in that case).

    Raises:
        TypeError: if *known* cannot be serialised to JSON. This is raised
            before the file is touched, so the existing cache stays intact.
    """
    # Serialise first: opening with mode 'w' truncates the file immediately,
    # so an unserialisable mapping must fail before the old cache is wiped.
    payload = json.dumps(known)
    try:
        f = open(CACHE, 'w')
    except IOError:
        print('Error opening CMU cache file for writing: ' + CACHE)
        return False
    # The context manager closes the handle even if the write fails.
    with f:
        f.write(payload)
    return True
def write_other_cache(known):
    """Persist *known* as JSON to ``common_cache.json`` in the working directory.

    Args:
        known: JSON-serialisable object to store.

    Returns:
        ``True`` once the file has been written, ``False`` when the file could
        not be opened for writing (an error message is printed in that case).

    Raises:
        TypeError: if *known* cannot be serialised to JSON. This is raised
            before the file is touched, so an existing file stays intact.
    """
    path = 'common_cache.json'
    # Serialise first: opening with mode 'w' truncates the file immediately,
    # so an unserialisable mapping must fail before the old content is wiped.
    payload = json.dumps(known)
    try:
        f = open(path, 'w')
    except IOError:
        # Name the file; the message previously ended with a dangling colon.
        print('Error opening CMU cache file for writing: ' + path)
        return False
    # The context manager closes the handle even if the write fails.
    with f:
        f.write(payload)
    return True
def get_phonemes(fname, known):
    """Fetch phonemes for every entry in *fname* that is not in *known* yet.

    Each non-empty line of *fname* is one entry. An entry counts as a hit when
    its upper-cased form is already a key of *known* or when ``d.exclude``
    accepts it. Remaining entries that contain parentheses or non-ASCII
    characters are ignored. Everything else is a miss: misses are sent to the
    service in batches of ``d.CMU_QUERY_SIZE`` lines via send_query(), and each
    response is merged into *known* with parse_response().

    Args:
        fname: path of a text file with one entry per line.
        known: mapping of known entries to their phoneme lists; ``None`` is
            treated as an empty mapping.

    Returns:
        The mapping after all responses have been merged in.
    """
    if known is None:
        known = {}

    hits = 0
    misses = 0
    ignored = 0
    queries = 0
    # Raw lines (line endings included) waiting for the next query. Counting
    # the pending lines directly keeps `misses` an honest statistic and makes
    # every full batch exactly d.CMU_QUERY_SIZE lines long.
    pending = []

    with open(fname, 'r') as f:
        for line in f:
            entry = line.strip('\n')
            if entry.upper() == '':
                continue

            # Check to see if we already have this line in the cache
            if entry.upper() in known or d.exclude(entry):
                hits += 1
                continue

            if '(' in line or ')' in line:
                ignored += 1
                continue

            try:
                # Strict encoding, so non-ASCII input really raises; with the
                # 'replace' handler this check could never fail on Python 3.
                line.encode('ascii')
            except UnicodeError:
                # Got a weird character
                ignored += 1
                continue

            print('Cache miss: ' + line)
            pending.append(line)
            misses += 1

            if len(pending) >= d.CMU_QUERY_SIZE:
                known = parse_response(send_query(''.join(pending)), known)
                queries += 1
                pending = []

    # One more query to finish off any stragglers
    if pending:
        known = parse_response(send_query(''.join(pending)), known)
        queries += 1

    print('Submitted %d queries to the CMU service (%d hits, %d misses, %d ignored)' % (queries, hits, misses, ignored))
    return known
def parse_response(resp, known):
    """Merge the pronunciations listed in *resp* into *known*.

    The response text is read line by line. A line is used only when it has a
    tab separating a non-empty word from its phonemes; the phonemes are split
    on whitespace. A word ending in ``)`` (e.g. ``WORD(2)``) is filed under
    the text before its first ``(``, so every pronunciation of a word ends up
    in the same list.

    Args:
        resp: object whose ``text`` attribute holds the dictionary text
            (the value returned by send_query()).
        known: mapping of word -> list of pronunciations, each pronunciation
            being a list of phoneme strings; ``None`` is treated as an empty
            mapping. The mapping is updated in place.

    Returns:
        The updated mapping.
    """
    if known is None:
        known = {}

    for line in resp.text.split('\n'):
        parts = line.split('\t')
        if len(parts) < 2:
            continue

        word = parts[0]
        if not word:
            continue
        if word.endswith(')'):
            # e.g. 'WORD(2)' -> 'WORD'; presumably an alternate pronunciation.
            word = word.split('(')[0]

        # Round-trip through ASCII so non-ASCII characters become '?' while
        # the result stays text. Calling str() on the encoded bytes would
        # store "b'AH'" instead of "AH" on Python 3.
        phonemes = [
            phoneme.encode('ascii', 'replace').decode('ascii')
            for phoneme in parts[1].split()
        ]
        known.setdefault(word, []).append(phonemes)
    return known
def send_query(to_send):
    """Submit *to_send* to the CMU lextool service and fetch the result.

    The text is POSTed as the ``wordfile`` upload (with an empty ``handfile``).
    The returned page is then searched for the ``DICT`` marker, and the link
    that follows it, up to two characters before the next ``-->``, is
    downloaded.

    Args:
        to_send: the entries to look up, one per line.

    Returns:
        The ``requests`` response of the GET request for the extracted link.

    Raises:
        requests.exceptions.InvalidURL: if the page returned by the service
            does not contain a ``DICT`` link.
    """
    file_stuff = {
        'wordfile': to_send,
        'handfile': ''
    }
    # Reuse the module-level COOKIES / HEADERS rather than a second,
    # byte-identical copy of both dicts.
    resp = requests.post('http://www.speech.cs.cmu.edu/cgi-bin/tools/logios/lextool.pl', headers=HEADERS,
                         cookies=COOKIES, files=file_stuff)

    page = resp.text
    marker = page.find("DICT")
    # The link starts five characters after the start of the marker.
    link_start = marker + 5
    link_end = page.find('-->', link_start) if marker != -1 else -1
    if marker == -1 or link_end == -1:
        # str.find() returns -1 on a miss; slicing with it would silently
        # produce a garbage URL, so fail with a clear message instead.
        raise requests.exceptions.InvalidURL(
            'CMU lextool response did not contain a DICT link')

    # Drop the two characters in front of '-->' (presumably padding).
    dict_link = page[link_start:link_end - 2]
    return requests.get(dict_link)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment