Skip to content

Instantly share code, notes, and snippets.

@hasyimibhar
Last active January 23, 2022 18:13
Show Gist options
  • Save hasyimibhar/53d9ac94b5bbb15ad83fd112f81b470a to your computer and use it in GitHub Desktop.
Save hasyimibhar/53d9ac94b5bbb15ad83fd112f81b470a to your computer and use it in GitHub Desktop.
Search for homophones, and sort them by decreasing max word frequency
import json
from wordfreq import word_frequency
# Source: https://github.com/AllenDowney/ThinkPython2/blob/master/code/pronounce.py
def read_dictionary(filename='c06d'):
    """Read a pronouncing dictionary file into a word -> pronunciation map.

    Every non-comment line is split on whitespace: the first token,
    lower-cased, becomes the key and the remaining tokens, joined by
    single spaces, become the pronunciation string. Lines whose first
    character is '#' are treated as comments, and blank lines are
    skipped. If a key occurs more than once, the last occurrence wins.

    Per the upstream source, secondary pronunciations appear in the
    data with a number, in parentheses, at the end of the key, so the
    key for the second pronunciation of "abdominal" is "abdominal(2)".

    filename: string
    returns: map from string to pronunciation
    """
    d = {}
    # The context manager guarantees the file is closed even if a line
    # fails to parse.
    with open(filename) as fin:
        for line in fin:
            # skip over the comments
            if line.startswith('#'):
                continue
            t = line.split()
            # A blank (or whitespace-only) line has no word token.
            if not t:
                continue
            d[t[0].lower()] = ' '.join(t[1:])
    return d
# Load the word -> pronunciation mapping from the default dictionary file.
word_pron = read_dictionary()

# Group words by pronunciation string. Each group records its member
# words and the largest word_frequency(word, 'en') value among them.
pron_word = {}
for word, pron in word_pron.items():
    # Keys containing anything other than letters are ignored.
    if word.isalpha():
        group = pron_word.setdefault(pron, {'max_freq': 0, 'words': []})
        group['words'].append(word)
        group['max_freq'] = max(group['max_freq'], word_frequency(word, 'en'))

# A pronunciation shared by fewer than two words is not a homophone set.
pron_word = {
    pron: group
    for pron, group in pron_word.items()
    if len(group['words']) >= 2
}

# Attach the pronunciation to each group and collect the groups, then
# order them from the highest max frequency to the lowest.
homophones = []
for pron, group in pron_word.items():
    group['pron'] = pron
    homophones.append(group)
homophones.sort(key=lambda group: group['max_freq'], reverse=True)

# Write the ranked homophone groups as indented JSON.
with open("homophones.json", "w") as out:
    json.dump(homophones, out, indent=2)
@hasyimibhar
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment