kumarde/cmu.py

## cmu.py

"""This file handles the interface with the CMU word -> phoneme service."""

import requests
import json
import defaults as d
import os

# Cookie values for the CMU lextool service. The '__utm*' / '_ga' / '_gid'
# keys look like analytics cookies captured from a browser session --
# TODO confirm whether the service actually requires any of them.
COOKIES = {
    '__utma': '44984886.2147423455.1500652902.1500652902.1500652902.1',
    '__utmz': '44984886.1500652902.1.1.utmccn=(referral)|utmcsr=google.com|utmcct=/|utmcmd=referral',
    '_ga': 'GA1.2.858928425.1500652902',
    '_gid': 'GA1.2.1514959829.1509572636',
    '__utmt': '1',
    '__utmb': '100617052.2.10.1509725212',
    '__utmc': '100617052',
}

# Browser-like request headers; 'Origin' and 'Referer' name the lextool page
# on www.speech.cs.cmu.edu.
HEADERS = {
    'Origin': 'http://www.speech.cs.cmu.edu',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/62.0.3202.75 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': 'http://www.speech.cs.cmu.edu/tools/lextool.html',
    'Connection': 'keep-alive',
}

# Path of the JSON file that holds the results of previous queries to the CMU
# service, so we do not have to repeat queries on words we have already
# queried. The path itself comes from the `defaults` module.
CACHE = d.CMU_CACHE_FILE


# This takes the results array and gives back a words -> phonemes mapping which
# is guaranteed to cover all *valid* words in the results array
def get_updated_phonemes(results_array):
    """Return a word -> phonemes mapping for the entries of *results_array*.

    The ``d.UNDERSTOOD`` and ``d.EXPECTED`` entries of every row are written,
    one per line, to a scratch file. That file is handed to ``get_phonemes``
    together with the cached mapping; the resulting mapping is written back
    to the cache and returned.

    Args:
        results_array: iterable of rows indexable by ``d.UNDERSTOOD`` and
            ``d.EXPECTED``; both entries are concatenated with ``'\\n'`` and
            so must be strings.

    Returns:
        The mapping produced by ``get_phonemes``.
    """
    scratch = 'tosend.txt'

    # read_cache() returns None when the cache file cannot be opened (for
    # example on a first run). Start from an empty mapping in that case
    # rather than failing on the membership tests inside get_phonemes().
    known_phonemes = read_cache()
    if known_phonemes is None:
        known_phonemes = {}

    try:
        with open(scratch, 'w') as cmu_file:
            for row in results_array:
                cmu_file.write(row[d.UNDERSTOOD] + '\n')
                cmu_file.write(row[d.EXPECTED] + '\n')
        known_phonemes = get_phonemes(scratch, known_phonemes)
    finally:
        # Remove the scratch file even when writing or querying failed.
        if os.path.exists(scratch):
            os.remove(scratch)

    write_cache(known_phonemes)
    return known_phonemes


def read_cache():
    """Load the cached word -> phonemes mapping from the ``CACHE`` file.

    Returns:
        The object decoded from the JSON cache file, or ``None`` when the
        file cannot be opened (an error message is printed in that case).

    Raises:
        ValueError: if the file opens but does not contain valid JSON.
    """
    try:
        f = open(CACHE, 'r')
    except IOError as e:
        print(e, '\nError opening CMU cache file for reading: ' + CACHE)
        return None

    # The context manager closes the handle even if decoding raises.
    with f:
        return json.load(f)


def write_cache(known):
    """Serialize *known* as JSON into the ``CACHE`` file.

    Args:
        known: JSON-serializable word -> phonemes mapping.

    Returns:
        ``True`` once the data has been written, ``False`` when the cache
        file cannot be opened for writing (an error message is printed).
    """
    try:
        f = open(CACHE, 'w')
    except IOError:
        print('Error opening CMU cache file for writing: ' + CACHE)
        return False

    # The context manager closes the handle even if serialization raises.
    with f:
        json.dump(known, f)
    return True

def write_other_cache(known):
    """Serialize *known* as JSON into ``common_cache.json``.

    The file is created in (or overwritten in) the current working directory.

    Args:
        known: JSON-serializable mapping to store.

    Returns:
        ``True`` once the data has been written, ``False`` when the file
        cannot be opened for writing (an error message is printed).
    """
    path = 'common_cache.json'
    try:
        f = open(path, 'w')
    except IOError:
        # Name the file in the message; the original message ended at the
        # colon and gave no hint which file had failed.
        print('Error opening CMU cache file for writing: ' + path)
        return False

    # The context manager closes the handle even if serialization raises.
    with f:
        json.dump(known, f)
    return True


def _is_ascii(text):
    """Return True if *text* can be encoded as plain ASCII."""
    try:
        text.encode('ascii')
    except UnicodeError:
        return False
    return True


def get_phonemes(fname, known):
    """Look up phonemes for every line of *fname* that is not already known.

    Each non-empty line is classified as one of:

    * hit -- its upper-cased text is a key of *known*, or ``d.exclude``
      returns a true value for it;
    * ignored -- it contains a parenthesis or a non-ASCII character;
    * miss -- everything else.

    Misses are sent to the CMU service through ``send_query`` in batches of
    ``d.CMU_QUERY_SIZE`` lines, and each response is merged into *known* by
    ``parse_response``. A summary line is printed at the end.

    Args:
        fname: path of a text file with one entry per line.
        known: existing word -> pronunciations mapping; ``None`` is treated
            as an empty mapping.

    Returns:
        The mapping after all responses have been merged in.
    """
    if known is None:
        known = {}

    misses = 0
    hits = 0
    ignored = 0
    queries = 0
    # Lines waiting to be sent. Tracking the batch separately keeps `misses`
    # an honest count and makes every batch exactly CMU_QUERY_SIZE long.
    batch = []

    with open(fname, 'r') as f:
        for line in f:
            entry = line.strip('\n')
            if entry == '':
                continue

            if entry.upper() in known or d.exclude(entry):
                hits += 1
            elif '(' in line or ')' in line or not _is_ascii(line):
                # Parenthesised or non-ASCII entries are never sent.
                ignored += 1
            else:
                print('Cache miss: ' + line)
                batch.append(line)
                misses += 1

            if batch and len(batch) >= d.CMU_QUERY_SIZE:
                resp = send_query(''.join(batch))
                known = parse_response(resp, known)
                queries += 1
                batch = []

    # One more query to finish off any stragglers
    if batch:
        resp = send_query(''.join(batch))
        known = parse_response(resp, known)
        queries += 1

    print('Submitted %d queries to the CMU service (%d hits, %d misses, %d ignored)' % (queries, hits, misses, ignored))
    return known


def parse_response(resp, known):
    """Merge the dictionary lines of *resp* into *known*.

    ``resp.text`` is split into lines; a line is used only when it has a
    tab-separated second field and a non-empty first field. The first field
    is the word (a trailing ``(...)`` suffix is cut off at the first ``(``),
    the second field is split on whitespace into phoneme symbols. Each parsed
    pronunciation is appended to the list stored under the word.

    Args:
        resp: object with a ``text`` attribute holding the response body.
        known: word -> list of pronunciations mapping; updated in place.

    Returns:
        *known*, for convenience.
    """
    for line in resp.text.split('\n'):
        parts = line.split('\t')
        if len(parts) < 2:
            continue

        word = parts[0]
        if not word:
            continue

        if word[-1] == ')':
            word = word.split('(')[0]

        # Round-trip through ASCII to replace unexpected characters with '?'.
        # Decoding back to text matters on Python 3, where str() of the
        # encoded bytes would store "b'AH'" instead of "AH".
        phonemes = [
            symbol.encode('ascii', 'replace').decode('ascii')
            for symbol in parts[1].split()
        ]
        known.setdefault(word, []).append(phonemes)

    return known


def send_query(to_send, timeout=30):
    """Submit *to_send* to the CMU lextool script and fetch the result.

    The text is posted as the ``wordfile`` upload (with an empty
    ``handfile``) using the module-level ``HEADERS`` and ``COOKIES``. The
    reply is searched for the text ``DICT``; the link is read from 5
    characters past that marker up to 2 characters before the following
    ``-->``, and is then fetched with a plain GET.

    Args:
        to_send: newline-separated entries to look up.
        timeout: seconds to wait on each HTTP request before giving up.

    Returns:
        The ``requests`` response of the GET on the extracted link.

    Raises:
        requests.exceptions.HTTPError: if either request returns an error
            status.
        requests.exceptions.InvalidURL: if no link can be located in the
            reply to the POST.
    """
    file_stuff = {
        'wordfile': to_send,
        'handfile': ''
    }

    resp = requests.post('http://www.speech.cs.cmu.edu/cgi-bin/tools/logios/lextool.pl', headers=HEADERS,
                         cookies=COOKIES, files=file_stuff, timeout=timeout)
    resp.raise_for_status()

    # str.find() returns -1 on failure; without these checks a missing marker
    # would silently turn into a slice of unrelated page text.
    marker = resp.text.find("DICT")
    start = marker + 5
    end = resp.text.find('-->', start) if marker != -1 else -1
    if marker == -1 or end == -1:
        raise requests.exceptions.InvalidURL('No DICT link found in the CMU service response')
    dict_link = resp.text[start:end - 2]

    resp = requests.get(dict_link, timeout=timeout)
    resp.raise_for_status()

    return resp

	"""This file handles the interface with the CMU word -> phoneme service."""

	import requests
	import json
	import defaults as d
	import os

	# Cookie values for the CMU lextool service. The '|' separators in
	# '__utmz' must not be backslash-escaped: inside a Python string the
	# backslash would become part of the cookie value.
	COOKIES = {
		'__utma': '44984886.2147423455.1500652902.1500652902.1500652902.1',
		'__utmz': '44984886.1500652902.1.1.utmccn=(referral)|utmcsr=google.com|utmcct=/|utmcmd=referral',
		'_ga': 'GA1.2.858928425.1500652902',
		'_gid': 'GA1.2.1514959829.1509572636',
		'__utmt': '1',
		'__utmb': '100617052.2.10.1509725212',
		'__utmc': '100617052',
	}

	# Browser-like request headers; 'Origin' and 'Referer' name the lextool
	# page on www.speech.cs.cmu.edu. The 'Accept' value ends with the
	# '*/*;q=0.8' wildcard entry.
	HEADERS = {
		'Origin': 'http://www.speech.cs.cmu.edu',
		'Accept-Encoding': 'gzip, deflate',
		'Accept-Language': 'en-US,en;q=0.9',
		'Upgrade-Insecure-Requests': '1',
		'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
		'Chrome/62.0.3202.75 Safari/537.36',
		'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
		'Cache-Control': 'max-age=0',
		'Referer': 'http://www.speech.cs.cmu.edu/tools/lextool.html',
		'Connection': 'keep-alive',
	}

	# Path of the JSON file that holds the results of previous queries to the
	# CMU service, so we do not have to repeat queries on words we have already
	# queried. The path itself comes from the `defaults` module.
	CACHE = d.CMU_CACHE_FILE


	# This takes the results array and gives back a words -> phonemes mapping which
	# is guaranteed to cover all valid words in the results array
	def get_updated_phonemes(results_array):
		"""Return a word -> phonemes mapping for the entries of *results_array*.

		The ``d.UNDERSTOOD`` and ``d.EXPECTED`` entries of every row are
		written, one per line, to a scratch file. That file is handed to
		``get_phonemes`` together with the cached mapping; the resulting
		mapping is written back to the cache and returned.

		Args:
			results_array: iterable of rows indexable by ``d.UNDERSTOOD`` and
				``d.EXPECTED``; both entries are concatenated with ``'\\n'``
				and so must be strings.

		Returns:
			The mapping produced by ``get_phonemes``.
		"""
		scratch = 'tosend.txt'

		# read_cache() returns None when the cache file cannot be opened (for
		# example on a first run). Start from an empty mapping in that case
		# rather than failing on the membership tests inside get_phonemes().
		known_phonemes = read_cache()
		if known_phonemes is None:
			known_phonemes = {}

		try:
			with open(scratch, 'w') as cmu_file:
				for row in results_array:
					cmu_file.write(row[d.UNDERSTOOD] + '\n')
					cmu_file.write(row[d.EXPECTED] + '\n')
			known_phonemes = get_phonemes(scratch, known_phonemes)
		finally:
			# Remove the scratch file even when writing or querying failed.
			if os.path.exists(scratch):
				os.remove(scratch)

		write_cache(known_phonemes)
		return known_phonemes


	def read_cache():
		"""Load the cached word -> phonemes mapping from the ``CACHE`` file.

		Returns:
			The object decoded from the JSON cache file, or ``None`` when the
			file cannot be opened (an error message is printed in that case).

		Raises:
			ValueError: if the file opens but does not contain valid JSON.
		"""
		try:
			f = open(CACHE, 'r')
		except IOError as e:
			print(e, '\nError opening CMU cache file for reading: ' + CACHE)
			return None

		# The context manager closes the handle even if decoding raises.
		with f:
			return json.load(f)


	def write_cache(known):
		"""Serialize *known* as JSON into the ``CACHE`` file.

		Args:
			known: JSON-serializable word -> phonemes mapping.

		Returns:
			``True`` once the data has been written, ``False`` when the cache
			file cannot be opened for writing (an error message is printed).
		"""
		try:
			f = open(CACHE, 'w')
		except IOError:
			print('Error opening CMU cache file for writing: ' + CACHE)
			return False

		# The context manager closes the handle even if serialization raises.
		with f:
			json.dump(known, f)
		return True

	def write_other_cache(known):
		"""Serialize *known* as JSON into ``common_cache.json``.

		The file is created in (or overwritten in) the current working
		directory.

		Args:
			known: JSON-serializable mapping to store.

		Returns:
			``True`` once the data has been written, ``False`` when the file
			cannot be opened for writing (an error message is printed).
		"""
		path = 'common_cache.json'
		try:
			f = open(path, 'w')
		except IOError:
			# Name the file in the message; the original message ended at the
			# colon and gave no hint which file had failed.
			print('Error opening CMU cache file for writing: ' + path)
			return False

		# The context manager closes the handle even if serialization raises.
		with f:
			json.dump(known, f)
		return True


	def _is_ascii(text):
		"""Return True if *text* can be encoded as plain ASCII."""
		try:
			text.encode('ascii')
		except UnicodeError:
			return False
		return True


	def get_phonemes(fname, known):
		"""Look up phonemes for every line of *fname* that is not already known.

		Each non-empty line is classified as one of:

		* hit -- its upper-cased text is a key of *known*, or ``d.exclude``
		  returns a true value for it;
		* ignored -- it contains a parenthesis or a non-ASCII character;
		* miss -- everything else.

		Misses are sent to the CMU service through ``send_query`` in batches
		of ``d.CMU_QUERY_SIZE`` lines, and each response is merged into
		*known* by ``parse_response``. A summary line is printed at the end.

		Args:
			fname: path of a text file with one entry per line.
			known: existing word -> pronunciations mapping; ``None`` is
				treated as an empty mapping.

		Returns:
			The mapping after all responses have been merged in.
		"""
		if known is None:
			known = {}

		misses = 0
		hits = 0
		ignored = 0
		queries = 0
		# Lines waiting to be sent. Tracking the batch separately keeps
		# `misses` an honest count and makes every batch exactly
		# CMU_QUERY_SIZE long.
		batch = []

		with open(fname, 'r') as f:
			for line in f:
				entry = line.strip('\n')
				if entry == '':
					continue

				if entry.upper() in known or d.exclude(entry):
					hits += 1
				elif '(' in line or ')' in line or not _is_ascii(line):
					# Parenthesised or non-ASCII entries are never sent.
					ignored += 1
				else:
					print('Cache miss: ' + line)
					batch.append(line)
					misses += 1

				if batch and len(batch) >= d.CMU_QUERY_SIZE:
					resp = send_query(''.join(batch))
					known = parse_response(resp, known)
					queries += 1
					batch = []

		# One more query to finish off any stragglers
		if batch:
			resp = send_query(''.join(batch))
			known = parse_response(resp, known)
			queries += 1

		print('Submitted %d queries to the CMU service (%d hits, %d misses, %d ignored)' % (queries, hits, misses, ignored))
		return known


	def parse_response(resp, known):
		"""Merge the dictionary lines of *resp* into *known*.

		``resp.text`` is split into lines; a line is used only when it has a
		tab-separated second field and a non-empty first field. The first
		field is the word (a trailing ``(...)`` suffix is cut off at the
		first ``(``), the second field is split on whitespace into phoneme
		symbols. Each parsed pronunciation is appended to the list stored
		under the word.

		Args:
			resp: object with a ``text`` attribute holding the response body.
			known: word -> list of pronunciations mapping; updated in place.

		Returns:
			*known*, for convenience.
		"""
		for line in resp.text.split('\n'):
			parts = line.split('\t')
			if len(parts) < 2:
				continue

			word = parts[0]
			if not word:
				continue

			if word[-1] == ')':
				word = word.split('(')[0]

			# Round-trip through ASCII to replace unexpected characters with
			# '?'. Decoding back to text matters on Python 3, where str() of
			# the encoded bytes would store "b'AH'" instead of "AH".
			phonemes = [
				symbol.encode('ascii', 'replace').decode('ascii')
				for symbol in parts[1].split()
			]
			known.setdefault(word, []).append(phonemes)

		return known


	def send_query(to_send, timeout=30):
		"""Submit *to_send* to the CMU lextool script and fetch the result.

		The text is posted as the ``wordfile`` upload (with an empty
		``handfile``) using the module-level ``HEADERS`` and ``COOKIES``. The
		reply is searched for the text ``DICT``; the link is read from 5
		characters past that marker up to 2 characters before the following
		``-->``, and is then fetched with a plain GET.

		Args:
			to_send: newline-separated entries to look up.
			timeout: seconds to wait on each HTTP request before giving up.

		Returns:
			The ``requests`` response of the GET on the extracted link.

		Raises:
			requests.exceptions.HTTPError: if either request returns an error
				status.
			requests.exceptions.InvalidURL: if no link can be located in the
				reply to the POST.
		"""
		file_stuff = {
			'wordfile': to_send,
			'handfile': ''
		}

		resp = requests.post('http://www.speech.cs.cmu.edu/cgi-bin/tools/logios/lextool.pl', headers=HEADERS,
			cookies=COOKIES, files=file_stuff, timeout=timeout)
		resp.raise_for_status()

		# str.find() returns -1 on failure; without these checks a missing
		# marker would silently turn into a slice of unrelated page text.
		marker = resp.text.find("DICT")
		start = marker + 5
		end = resp.text.find('-->', start) if marker != -1 else -1
		if marker == -1 or end == -1:
			raise requests.exceptions.InvalidURL('No DICT link found in the CMU service response')
		dict_link = resp.text[start:end - 2]

		resp = requests.get(dict_link, timeout=timeout)
		resp.raise_for_status()

		return resp