# hasyimibhar/homophones.py

## homophones.py
import json
from wordfreq import word_frequency

# Source: https://github.com/AllenDowney/ThinkPython2/blob/master/code/pronounce.py
def read_dictionary(filename='c06d'):
    """Reads from a file and builds a dictionary that maps from
    each word to a string that describes its pronunciation.

    Every line that is neither a comment nor blank is split on
    whitespace: the first token, lower-cased, becomes the key and the
    remaining tokens, re-joined with single spaces, become the value.
    Keys are taken from the file as-is, so a secondary pronunciation
    keeps its numbered suffix, e.g. the key for the second pronunciation
    of "abdominal" is "abdominal(2)".  If a key appears more than once,
    the last occurrence wins.

    Lines whose first character is '#' are treated as comments.  Blank
    and whitespace-only lines are skipped.

    filename: string
    returns: map from string to pronunciation
    """
    d = {}

    # The context manager guarantees the file is closed, even if a line
    # fails to parse.
    with open(filename) as fin:
        for line in fin:

            # skip over the comments
            if line.startswith('#'):
                continue

            t = line.split()

            # A blank or whitespace-only line yields no tokens; without
            # this guard t[0] would raise IndexError.
            if not t:
                continue

            word = t[0].lower()
            pron = ' '.join(t[1:])
            d[word] = pron

    return d

# Load the word -> pronunciation map, then invert it so that every
# pronunciation points at the group of words that share it.
word_pron = read_dictionary()
pron_word = {}

for word, pron in word_pron.items():
    # Only purely alphabetic keys are kept; anything else in the
    # dictionary (digits, punctuation, "(2)" suffixes, ...) is ignored.
    if word.isalpha():
        group = pron_word.setdefault(pron, {'max_freq': 0, 'words': []})
        group['words'].append(word)

        # Remember the frequency of the most common word in the group.
        freq = word_frequency(word, 'en')
        group['max_freq'] = max(group['max_freq'], freq)

# A pronunciation needs at least two words to count as a homophone set.
pron_word = {
    key: group
    for key, group in pron_word.items()
    if len(group['words']) >= 2
}

# Tag each group with its pronunciation and collect the groups in a list.
for pron, entry in pron_word.items():
    entry['pron'] = pron
homophones = list(pron_word.values())

# Most frequent groups first; ties keep their original relative order.
homophones.sort(key=lambda group: group['max_freq'], reverse=True)

with open("homophones.json", "w") as out:
    json.dump(homophones, out, indent=2)
	import json
	from wordfreq import word_frequency

	# Source: https://github.com/AllenDowney/ThinkPython2/blob/master/code/pronounce.py
	def read_dictionary(filename='c06d'):
		"""Reads from a file and builds a dictionary that maps from
		each word to a string that describes its pronunciation.

		Every line that is neither a comment nor blank is split on
		whitespace: the first token, lower-cased, becomes the key and the
		remaining tokens, re-joined with single spaces, become the value.
		Keys are taken from the file as-is, so a secondary pronunciation
		keeps its numbered suffix, e.g. the key for the second pronunciation
		of "abdominal" is "abdominal(2)".  If a key appears more than once,
		the last occurrence wins.

		Lines whose first character is '#' are treated as comments.  Blank
		and whitespace-only lines are skipped.

		filename: string
		returns: map from string to pronunciation
		"""
		d = {}

		# The context manager guarantees the file is closed, even if a line
		# fails to parse.
		with open(filename) as fin:
			for line in fin:

				# skip over the comments
				if line.startswith('#'):
					continue

				t = line.split()

				# A blank or whitespace-only line yields no tokens; without
				# this guard t[0] would raise IndexError.
				if not t:
					continue

				word = t[0].lower()
				pron = ' '.join(t[1:])
				d[word] = pron

		return d

	# Load the word -> pronunciation map, then invert it so that every
	# pronunciation points at the group of words that share it.
	word_pron = read_dictionary()
	pron_word = {}

	for word, pron in word_pron.items():
		# The dictionary contains words with non-alphabets, so skip them
		if not word.isalpha():
			continue

		# First word seen for this pronunciation: start a new group.
		if pron not in pron_word:
			pron_word[pron] = {
				'max_freq': 0,
				'words': [],
			}

		pron_word[pron]['words'].append(word)

		# Remember the frequency of the most common word in the group.
		freq = word_frequency(word, 'en')
		if freq > pron_word[pron]['max_freq']:
			pron_word[pron]['max_freq'] = freq

	# Filter out entries with only 1 word
	pron_word = {key: value for (key, value) in pron_word.items() if len(value['words']) > 1}

	homophones = []

	# Tag each group with its pronunciation and collect the groups in a list.
	for pron, entry in pron_word.items():
		entry['pron'] = pron
		homophones.append(entry)

	# Most frequent groups first; ties keep their original relative order.
	homophones = sorted(homophones, key=lambda e: e['max_freq'], reverse=True)

	with open("homophones.json", "w") as f:
		f.write(json.dumps(homophones, indent=2))