Last active
December 28, 2018 05:26
-
-
Save lettergram/60e96a4291d60c175c19faef54ed0340 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def create_word_embedding(comments, add_pos_tags=False):
    """
    Encode every comment as a list of integer token ids.

    Ids are handed out in order of first appearance, starting at 0, and the
    same mapping is shared across all comments in this call.

    :param comments: Iterable of comment strings (each one is lowercased and tokenized)
    :param add_pos_tags: When True, each token is followed by its part-of-speech tag
                         before encoding; tags draw ids from the same mapping as words
    :return encoded_comments: List of lists of ints, one inner list per comment
    """
    token_ids = {}
    encoded_comments = []

    for raw_comment in comments:
        # Lowercase first so differently-cased spellings map to one id,
        # then split into tokens: [ "this", "is", "a", "sentence", "." ]
        tokens = nltk.word_tokenize(raw_comment.lower())

        if add_pos_tags:
            # Flatten the tagger's (word, tag) pairs into one sequence:
            # [ "word", "POS_tag", "word", "POS_tag", ... ]
            interleaved = []
            for tagged_pair in nltk.pos_tag(tokens):
                interleaved.extend(tagged_pair)
            tokens = interleaved

        # An unseen token receives the next free id, which is the current size
        # of the mapping: { "this": 0, "is": 1, ... }. Known tokens keep their id.
        encoded_comments.append(
            [token_ids.setdefault(token, len(token_ids)) for token in tokens]
        )

    return encoded_comments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment