Created
March 30, 2013 13:39
-
-
Save safehammad/5276713 to your computer and use it in GitHub Desktop.
Find "tonal" words in English, or more accurately, heteronyms i.e. two or more words with the same spelling but different pronunciation and meaning. More specifically, this script is interested in the stresses in a pronunciation and finds many "initial-stress-derived nouns" where verbs can be turned into nouns by stressing the first syllable e.g…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import cmudict | |
from nltk.corpus import wordnet | |
def is_multi_pos(word, required_pos=('n', 'v')):
    """Return True if the given word can be used as every part of speech requested.

    By default the word must be usable as both a noun ('n') and a verb ('v').
    Pass a different ``required_pos`` iterable of WordNet part-of-speech tags
    to look for other combinations (e.g. adding an adjective tag).

    Only synsets whose first lemma name equals ``word`` are considered, so a
    word that merely appears as a secondary lemma of a synset does not count.
    """
    found_pos = set()
    for synset in wordnet.synsets(word):
        # NLTK 3 turned ``Synset.lemma_names`` and ``Synset.pos`` into methods,
        # while older releases exposed them as plain attributes. Accept both so
        # the script does not fail with "'method' object is not subscriptable".
        lemma_names = synset.lemma_names
        if callable(lemma_names):
            lemma_names = lemma_names()
        if lemma_names[0] != word:
            continue
        pos = synset.pos
        if callable(pos):
            pos = pos()
        found_pos.add(pos)
    return all(tag in found_pos for tag in required_pos)
def stresses(pronunciation):
    """Extract the stress markers from one pronunciation.

    ``pronunciation`` is a sequence of phoneme strings. A phoneme that ends in
    a digit carries a stress marker; that trailing digit is collected, in
    order, as a one-character string:

        0 - No stress
        1 - Primary stress
        2 - Secondary stress

    Phonemes that do not end in a digit are skipped. For example, 'python' is
    ['P', 'AY1', 'TH', 'AA0', 'N'], which yields ['1', '0'].
    """
    markers = []
    for phoneme in pronunciation:
        last_char = phoneme[-1]
        if last_char.isdigit():
            markers.append(last_char)
    return markers
def is_tonal(pronunciation):
    """Tell whether a word has 2+ distinct stress patterns, each of 2+ syllables.

    ``pronunciation`` is the collection of alternative pronunciations for one
    word. Each alternative is reduced to its stress string (e.g. '10'), and
    the word qualifies only when more than one distinct string exists and even
    the shortest of them has more than one stress marker.
    """
    patterns = set()
    for variant in pronunciation:
        patterns.add(''.join(stresses(variant)))

    # A single (or no) stress pattern means there is nothing to contrast.
    if len(patterns) <= 1:
        return False

    fewest_syllables = min(len(pattern) for pattern in patterns)
    return fewest_syllables > 1
def get_tonal_words():
    """Return a sorted list of "tonal" English words i.e. heteronyms.

    Every entry of the CMU pronouncing dictionary is kept when its
    pronunciations pass ``is_tonal`` and the word itself passes
    ``is_multi_pos``.
    """
    # ``items()`` instead of the Python 2-only ``iteritems()``: it behaves the
    # same here and keeps the function working under Python 3 as well.
    return sorted(
        word
        for word, pronunciations in cmudict.dict().items()
        if is_tonal(pronunciations) and is_multi_pos(word)
    )
if __name__ == '__main__':
    # Function-call form of print: identical output for a single argument in
    # Python 2, and valid syntax in Python 3 (the statement form is not).
    print(get_tonal_words())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment