Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
def create_word_embedding(comments, add_pos_tags=False):
    """Encode each comment as a list of integer token ids.

    Every comment is lowercased and tokenized with ``nltk.word_tokenize``.
    Each distinct token is assigned an integer id the first time it is
    seen (ids start at 0 and grow by one per new token), and the mapping
    is shared across all comments so the same token always gets the same
    id within one call.

    :param comments: Iterable of comment strings to encode. Each item must
        be a ``str`` because it is lowercased before tokenization.
    :param add_pos_tags: When true, each token is followed by its
        part-of-speech tag from ``nltk.pos_tag``; tags are encoded with
        the same mapping as the words.
    :return encoded_comments: One list of integer ids per input comment,
        in input order. Empty input yields an empty list.
    """
    # token -> id, in first-seen order. The mapping is local to this call
    # and is not returned.
    word_to_id = {}
    encoded_comments = []

    for comment in comments:
        # Segment sentence(s) to a list: ["this", "is", "a", "sentence", "."]
        # Lowercase first so that case variants map to the same id.
        tokens = nltk.word_tokenize(comment.lower())

        if add_pos_tags:
            # Flatten [(word, tag), ...] into [word, tag, word, tag, ...].
            tokens = [
                item
                for word_and_tag in nltk.pos_tag(tokens)
                for item in word_and_tag
            ]

        # Build the mapping {"this": 0, "is": 1, ...} and encode the comment.
        # setdefault() only evaluates to the new id when the token is unseen,
        # so len(word_to_id) is always the next free id.
        encoded_comment = [
            word_to_id.setdefault(token, len(word_to_id)) for token in tokens
        ]
        encoded_comments.append(encoded_comment)

    return encoded_comments
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment