Created
January 28, 2020 14:37
-
-
Save gracecarrillo/f1f56ad0a369781fd1030362e9b8f983 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------ FEATURE ENGINEERING ---------------- #
# --- Part-of-speech (POS) tags --- #

# Download the NLTK 'averaged_perceptron_tagger' resource.
nltk.download('averaged_perceptron_tagger')

# POS tag codes grouped by coarse part-of-speech family.
pos_family = {
    'NOUN': ['NN', 'NNS', 'NNP'],  # 'NNPS' was removed from this list
    'PRON': ['PRP', 'PRP$', 'WP', 'WP$'],
    'VERB': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'ADJ': ['JJ', 'JJR', 'JJS'],
    'ADV': ['RB', 'RBR', 'RBS', 'WRB'],
}
def count_pos_tag(tweets):
    """Count part-of-speech tags for each tweet.

    For every text in *tweets*:
      1. Split the text on whitespace into tokens.
      2. Tag the tokens with ``nltk.pos_tag``.
      3. Count how many tokens received each tag.

    Args:
        tweets: Iterable of strings, one per tweet.

    Returns:
        A list with one dict per input tweet, in input order, mapping each
        POS tag to the number of tokens in that tweet carrying the tag.
        A tweet with no tokens yields an empty dict.
    """
    total_count = []
    for tweet in tweets:
        tokens = tweet.split()
        if not tokens:
            # Nothing to tag; skip the tagger call entirely.
            total_count.append({})
            continue
        # Count tags directly from the (word, tag) pairs. Converting the
        # pairs to a dict first would merge repeated words into one entry
        # and under-count their tags.
        tag_counts = Counter(tag for _word, tag in nltk.pos_tag(tokens))
        total_count.append(dict(tag_counts))
    return total_count
# Run the tagger over the cleaned tweets of the training data.
# The result holds one {POS tag: count} dict per tweet.
total_count = count_pos_tag(train.tidy_tweet.values)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment