Created
December 22, 2011 15:23
-
-
Save sebastiangeiger/1510674 to your computer and use it in GitHub Desktop.
NLTK homework
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import locale | |
import nltk | |
from nltk.corpus import wordnet as wn | |
from nltk.tag.simplify import simplify_wsj_tag | |
def main():
    """Report how many WordNet sense combinations a sample sentence has.

    The sentence is tokenized with ``nltk.word_tokenize`` and the number of
    combinations is printed twice: first ignoring each token's word type,
    then restricting each token to the senses of its tagged word type.
    """
    sentence = "I saw a man who is 98 years old and can still walk and tell jokes."
    words = nltk.word_tokenize(sentence)

    # A parenthesised single-argument print behaves the same whether print
    # is a statement or a function.
    untyped_total = nice_formatting(combinations_without_wordtype(words))
    print(untyped_total + " possible combinations when not taking wordtype into account")

    typed_total = nice_formatting(combinations_with_wordtype(words))
    print(typed_total + " possible combinations when taking wordtype into account")
def combinations_without_wordtype(tokens):
    """Multiply together the number of WordNet synsets of every token.

    Args:
        tokens: iterable of word strings.

    Returns:
        The product of the per-token synset counts, where a token with no
        synsets counts as 1. An empty ``tokens`` yields 1.
    """
    product = 1
    for word in tokens:
        sense_count = len(wn.synsets(word))
        # Counts of 0 or 1 leave the product unchanged, so only larger
        # counts need to be multiplied in.
        if sense_count > 1:
            product *= sense_count
    return product
def combinations_with_wordtype(tokens):
    """Multiply together per-token synset counts restricted by word type.

    Each token is tagged with ``nltk.pos_tag`` and the tag is converted via
    ``translate_to_wordnet_tags``. When the tag is translatable, only the
    synsets of that word type are counted; otherwise the token counts as 1.

    Args:
        tokens: sequence of word strings accepted by ``nltk.pos_tag``.

    Returns:
        The product of the per-token counts, where a token with no matching
        synsets counts as 1.
    """
    product = 1
    for word, tag in nltk.pos_tag(tokens):
        has_wordnet_pos, wordnet_pos = translate_to_wordnet_tags(tag)
        if not has_wordnet_pos:
            # Untranslatable tags contribute a factor of 1.
            continue
        sense_count = len(wn.synsets(word, wordnet_pos))
        if sense_count > 1:
            product *= sense_count
    return product
def translate_to_wordnet_tags(parser_tag):
    """Convert a tagger tag into a WordNet word-type constant.

    The tag is first reduced with ``simplify_wsj_tag`` and the simplified
    form is then matched against the tags handled here.

    Args:
        parser_tag: tag string as passed in by the caller (the tags
            returned by ``nltk.pos_tag`` in this file).

    Returns:
        A ``(translatable, value)`` pair. When ``translatable`` is True,
        ``value`` is one of ``wn.NOUN``, ``wn.VERB``, ``wn.ADJ`` or
        ``wn.ADV``. When it is False, ``value`` is ``'.'`` for the ``'.'``
        tag, or an ``"Unknown tag: ..."`` message for anything else.
    """
    simplified = simplify_wsj_tag(parser_tag)

    if simplified == 'N':
        return True, wn.NOUN
    # All verb forms, and 'MOD', are looked up as verbs.
    if simplified in ('V', 'VD', 'VG', 'VN', 'MOD'):
        return True, wn.VERB
    if simplified == 'ADJ':
        return True, wn.ADJ
    if simplified == 'ADV':
        return True, wn.ADV
    if simplified == '.':
        return False, '.'
    return False, "Unknown tag: " + simplified
def nice_formatting(integer):
    """Return *integer* as a string with commas as thousands separators.

    The previous implementation switched the whole process to the 'en_US'
    locale on every call and used ``locale.format``. That raises
    ``locale.Error`` where this exact locale name is not installed, changes
    global state as a side effect, and relies on a function that newer
    Python versions no longer provide. The ``','`` format specifier gives
    the same en_US-style grouping without depending on the system locale.

    Args:
        integer: the number to format. Non-integral numbers are truncated
            toward zero, matching what ``%d`` formatting did.

    Returns:
        The grouped decimal string, e.g. ``"1,234,567"``.
    """
    return "{:,d}".format(int(integer))
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment