Last active
January 23, 2022 18:13
-
-
Save hasyimibhar/53d9ac94b5bbb15ad83fd112f81b470a to your computer and use it in GitHub Desktop.
Search for homophones, and sort them by decreasing max word frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from wordfreq import word_frequency | |
# Source: https://github.com/AllenDowney/ThinkPython2/blob/master/code/pronounce.py | |
def read_dictionary(filename='c06d'):
    """Read a pronouncing dictionary file and map each word to its pronunciation.

    Every line that is neither a comment (starting with '#') nor blank is
    split on whitespace: the first token, lowercased, becomes the key and
    the remaining tokens, joined by single spaces, become the pronunciation.

    Secondary pronunciations are added to the dictionary with
    a number, in parentheses, at the end of the key, so the
    key for the second pronunciation of "abdominal" is "abdominal(2)".

    If the same key occurs more than once, the last occurrence wins.

    filename: string, path of the dictionary file
    returns: map from string to pronunciation
    """
    d = dict()
    # The context manager closes the file even if parsing raises.
    with open(filename) as fin:
        for line in fin:
            # skip over the comments
            if line.startswith('#'):
                continue
            t = line.split()
            # Blank or whitespace-only lines carry no word token; indexing
            # t[0] on them would raise IndexError, so skip them.
            if not t:
                continue
            d[t[0].lower()] = ' '.join(t[1:])
    return d
word_pron = read_dictionary()

# Group words by pronunciation, remembering the highest word frequency
# seen within each group.
pron_word = {}
for word, pron in word_pron.items():
    # Only purely alphabetic keys are kept; the dictionary also holds
    # entries containing other characters.
    if word.isalpha():
        group = pron_word.setdefault(pron, {'max_freq': 0, 'words': []})
        group['words'].append(word)
        # Current value goes first so that ties keep the existing value.
        group['max_freq'] = max(group['max_freq'], word_frequency(word, 'en'))

# A pronunciation shared by fewer than two words is not a homophone set.
pron_word = {
    pron: group
    for pron, group in pron_word.items()
    if len(group['words']) >= 2
}

# Record the pronunciation inside each entry so the flat list is
# self-describing once the dictionary keys are gone.
for pron, group in pron_word.items():
    group['pron'] = pron

# Most frequent homophone sets first.
homophones = list(pron_word.values())
homophones.sort(key=lambda e: e['max_freq'], reverse=True)

with open("homophones.json", "w") as out:
    serialized = json.dumps(homophones, indent=2)
    out.write(serialized)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The file
c06d
can be obtained here: https://github.com/AllenDowney/ThinkPython2/blob/master/code/c06d