Skip to content

Instantly share code, notes, and snippets.

Created March 30, 2013 13:39
Show Gist options
  • Save safehammad/5276713 to your computer and use it in GitHub Desktop.
Save safehammad/5276713 to your computer and use it in GitHub Desktop.
Find "tonal" words in English or, more accurately, heteronyms, i.e. two or more words with the same spelling but different pronunciation and meaning. More specifically, this script is interested in the stresses in a pronunciation and finds many "initial-stress-derived nouns", where verbs can be turned into nouns by stressing the first syllable, e.g…
from nltk.corpus import cmudict
from nltk.corpus import wordnet
def _attr_or_call(value):
    """Return ``value()`` if value is callable, otherwise value itself.

    NLTK 3 turned ``Synset.pos`` and ``Synset.lemma_names`` from plain
    attributes into methods; this lets the caller work with either API.
    """
    return value() if callable(value) else value


def is_multi_pos(word):
    """Return True if the given word can be used as both a noun and a verb.

    With a small tweak, you can find words which can also be used as an
    adjective, or any combination of the above.
    """
    pos = set()
    for synset in wordnet.synsets(word):
        # Only count synsets whose first lemma name is the word itself.
        if _attr_or_call(synset.lemma_names)[0] == word:
            pos.add(_attr_or_call(synset.pos))
    return 'n' in pos and 'v' in pos
def stresses(pronunciation):
    """Return just the stresses of a pronunciation as a list of characters.

    Each part of the pronunciation that ends in a digit contributes that
    digit, where:

        0 - No stress
        1 - Primary stress
        2 - Secondary stress

    For example, the pronunciation for 'python',
    ['P', 'AY1', 'TH', 'AA0', 'N'], returns ['1', '0'].
    """
    # Slice with [-1:] for the digit test so that an empty string is simply
    # skipped instead of raising IndexError.
    return [part[-1] for part in pronunciation if part[-1:].isdigit()]
def is_tonal(pronunciation):
    """Return True for 2+ syllable words where there are 2+ ways to stress the pronunciation.

    Despite the singular name, ``pronunciation`` is iterated as a collection
    of alternative pronunciations for one word; each alternative is passed
    to ``stresses()``.
    """
    # One stress string (e.g. '10') per alternative; the set drops duplicates.
    stress_set = set(''.join(stresses(variant)) for variant in pronunciation)
    if len(stress_set) <= 1:
        # Fewer than two distinct stress patterns (also covers empty input).
        return False
    # The length of a stress string is the syllable count of that variant;
    # every variant must have more than one syllable.
    return min(len(pattern) for pattern in stress_set) > 1
def get_tonal_words():
    """Return a sorted list of "tonal" English words i.e. heteronyms.

    A word is included when its cmudict entry passes ``is_tonal`` and the
    word itself passes ``is_multi_pos``.
    """
    # .items() instead of .iteritems(): the latter does not exist on
    # Python 3 dicts, while .items() works on both Python 2 and 3.
    return sorted(
        word
        for word, pronunciation in cmudict.dict().items()
        if is_tonal(pronunciation) and is_multi_pos(word)
    )
if __name__ == '__main__':
    # Parenthesised single-argument form: behaves as the print statement on
    # Python 2 and as the print function on Python 3.
    print(get_tonal_words())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment