# lettergram/word_embedding.py

## word_embedding.py
def create_word_embedding(comments, add_pos_tags=False):
  '''
  Encode each comment as a list of integer token ids.

  Every comment is lowercased and tokenized with ``nltk.word_tokenize``.
  Each distinct token is assigned the next unused integer id, starting at 0,
  in order of first appearance across all comments. The token-to-id mapping
  is local to one call and is not returned.

  Relies on ``nltk`` being available at module scope.

  :param comments: Iterable of comment strings to encode
  :param add_pos_tags: When true, each token is followed by the tag that
    ``nltk.pos_tag`` gives it ([ "word", "POS_tag", "word", "POS_tag", ... ]);
    tags draw ids from the same mapping as the words
  :return encoded_comments: One list of integer ids per comment, in input order
  '''

  word_embedding = {}
  encoded_comments = []

  for comment in comments:
    # Segment sentence(s) to a list: [ "this", "is", "a", "sentence", "." ]
    # Lowercasing first makes "This" and "this" share a single id.
    tokens = nltk.word_tokenize(comment.lower())

    # Flatten the (word, tag) pairs so each tag sits right after its word.
    if add_pos_tags:
      tokens = [ele for word_tuple in nltk.pos_tag(tokens) for ele in word_tuple]

    # Mapping grows as { "this": 0, "is": 1, ... }. len() is evaluated before
    # setdefault inserts, so an unseen token receives the next free id and a
    # known token keeps the id it already has.
    encoded_comments.append(
      [word_embedding.setdefault(token, len(word_embedding)) for token in tokens]
    )

  return encoded_comments