Skip to content

Instantly share code, notes, and snippets.

@jss367
Created September 23, 2017 07:39
Show Gist options
  • Save jss367/dac7dc15737e5afe566e333eeaf214c4 to your computer and use it in GitHub Desktop.
Save jss367/dac7dc15737e5afe566e333eeaf214c4 to your computer and use it in GitHub Desktop.
# Let's make a single function to determine the parts of speech
import re
import nltk
import os
#from collections import Counter # Is this used?
# First we break the text into tokens
def tokinze_text(raw_text):
    """Break raw text into word tokens.

    Thin wrapper around ``nltk.word_tokenize``.

    Args:
        raw_text: The text to tokenize, as a single string.

    Returns:
        Whatever ``nltk.word_tokenize(raw_text)`` returns (the tokens of
        ``raw_text`` in order).
    """
    return nltk.word_tokenize(raw_text)
# Tokenize the input text at import time so the tagging step below can use it.
# NOTE(review): `text` is not defined anywhere in the visible part of this
# file -- it must be assigned the raw input string before this line runs.
tokens = tokinze_text(text)
def mytagger(tokens):
    """Tag each token with its part of speech.

    Thin wrapper around ``nltk.pos_tag``.

    Args:
        tokens: The tokens to tag, e.g. the output of ``tokinze_text``.

    Returns:
        Whatever ``nltk.pos_tag(tokens)`` returns; the rest of this script
        indexes each item as ``item[1]`` to read its POS tag.
    """
    return nltk.pos_tag(tokens)
tagged = mytagger(tokens)

# Each `*_pos` list names the POS tags that belong to one category; the list
# declared right after it collects the tagged tokens that fall in that category.
# Note that IN can be either a preposition or a conjunction, for now we're going to list it with the prepositions
common_noun_pos = ['NN', 'NNS']
common_nouns = []
verb_pos = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
verbs = []
adjective_pos = ['JJ', 'JJR', 'JJS']
adjectives = []
pronoun_pos = ['PRP', 'PRP$', 'WP', 'WP$']
pronouns = []
adverb_pos = ['RB', 'RBR', 'RBS', 'WRB']
adverbs = []
proper_noun_pos = ['NNP', 'NNPS']
proper_nouns = []
conjunction_pos = ['CC']
conjunctions = []
preposition_pos = ['IN', 'TO']
prepositions = []
interjection_pos = ['UH']
interjections = []
modal_pos = ['MD']  # But these are also verbs, so let's make sure they show up as such
modals = []
tagged_other_pos = ['CD', 'DT', 'EX', 'FW', 'LS', 'PDT', 'POS', 'RP', 'SYM', 'WDT']
tagged_others = []
# Catch-all for tokens whose tag appears in none of the lists above.
other = []

# Build one tag -> bucket lookup so each token is classified with a single
# dictionary access instead of walking a long if/elif chain of list scans.
_bucket_for_tag = {}
for _tag_group, _bucket in (
    (common_noun_pos, common_nouns),
    (verb_pos, verbs),
    (adjective_pos, adjectives),
    (pronoun_pos, pronouns),
    (adverb_pos, adverbs),
    (proper_noun_pos, proper_nouns),
    (conjunction_pos, conjunctions),
    (preposition_pos, prepositions),
    (interjection_pos, interjections),
    (modal_pos, modals),
    (tagged_other_pos, tagged_others),
):
    for _tag in _tag_group:
        # setdefault keeps the first category that claims a tag, which is the
        # same priority order the categories are listed in above.
        _bucket_for_tag.setdefault(_tag, _bucket)

for token in tagged:
    # token[1] is the POS tag; unlisted tags fall through to `other`.
    _bucket_for_tag.get(token[1], other).append(token)

# The main categories, in a fixed order. `tagged_others` and `other` are
# deliberately left out here, exactly as in the original listing.
parts_of_speech = [
    common_nouns,
    verbs,
    adjectives,
    pronouns,
    adverbs,
    proper_nouns,
    conjunctions,
    prepositions,
    interjections,
    modals,
]
# TODO: Append modals to verbs
# TODO: Create a nouns list that contains both proper nouns and common nouns
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment