# lettergram/encode_data.py

## encode_data.py
def encode_and_split_data(comments, categories, data_split=0.8):
  '''
  Encode the comments and their categories, then cut both into a leading
  training portion and a trailing testing portion.

  The cut is purely positional: no shuffling happens here, and the same
  index (derived from the number of encoded comments) is applied to both
  the samples and the labels.

  :param comments: List of lists containing all comments
  :param categories: List containing labeled categories for associated comments
  :param data_split: Fraction of the data placed in the training set (default 0.8)
  :return x_train: Numpy array of the encoded training comments
  :return x_test: Numpy array of the encoded testing comments
  :return y_train: Numpy array of the encoded training categories
  :return y_test: Numpy array of the encoded testing categories
  '''

  # Comments are embedded with POS tags enabled; categories are embedded
  # with POS tags explicitly disabled.
  comment_vectors = create_word_embedding(comments, add_pos_tags=True)
  label_vectors = create_word_embedding(categories, add_pos_tags=False)

  # Index of the first testing sample (truncated towards zero)
  cutoff = int(len(comment_vectors) * data_split)

  # Everything before the cutoff trains, everything from it onwards tests
  train_part = slice(None, cutoff)
  test_part = slice(cutoff, None)

  return (
    np.array(comment_vectors[train_part]),
    np.array(comment_vectors[test_part]),
    np.array(label_vectors[train_part]),
    np.array(label_vectors[test_part]),
  )
	def encode_and_split_data(comments, categories, data_split=0.8):
		'''
		Encode comments and categories, then split them into training and
		testing sets.

		The split is positional (no shuffling): the first ``data_split``
		fraction of the encoded comments, and the labels at the same
		positions, form the training set; the remainder forms the testing set.

		:param comments: List of lists containing all comments
		:param categories: List containing labeled categories for associated comments
		:param data_split: The ratio of training to testing data (typical 80/20 split)
		:return x_train: Numpy array of encoded training sample(s) (comment)
		:return x_test: Numpy array of encoded testing sample(s) (comment)
		:return y_train: Numpy array of encoded training label (category)
		:return y_test: Numpy array of encoded testing label (category)
		'''

		# Word + Punctuation + POS Tags embedding
		encoded_comments = create_word_embedding(comments, add_pos_tags=True)

		# Word embedding, ensure you don't add the POS tags
		encoded_categories = create_word_embedding(categories, add_pos_tags=False)

		# Determine the training sample split point; int() truncates, so any
		# fractional sample falls into the testing set
		training_sample = int(len(encoded_comments) * data_split)

		# Split the dataset into training vs testing datasets, using the same
		# index for samples and labels so each pair stays aligned
		x_train = np.array(encoded_comments[:training_sample])
		x_test = np.array(encoded_comments[training_sample:])
		y_train = np.array(encoded_categories[:training_sample])
		y_test = np.array(encoded_categories[training_sample:])

		return x_train, x_test, y_train, y_test