Last active
December 31, 2018 15:09
-
-
Save bluedistro/10fc4ba945b906cc30fa3f752a924378 to your computer and use it in GitHub Desktop.
tokenization
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tokenize raw review texts, pad them to a fixed length, then shuffle and
# split the data into train / validation / test sets.
#
# NOTE(review): this script depends on names defined elsewhere in the
# original notebook/file — confirm before running in isolation:
#   texts, labels            -- raw review strings and their targets
#   Tokenizer, pad_sequences -- presumably from keras.preprocessing; verify
#   np                       -- numpy

# cut off reviews after 500 words
max_len = 500
# train on 10000 samples
training_samples = 10000
# validate on 10000 samples
validation_samples = 10000
# consider only the top 10000 words
max_words = 10000
# build a tokenizer restricted to the max_words most frequent words
# (fixed comment: the original said "top 500", but num_words=max_words=10000)
tokenizer = Tokenizer(num_words=max_words)
# fit the tokenizer on the texts
tokenizer.fit_on_texts(texts)
# convert the texts to sequences of integer word indices
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens. ' % len(word_index))
# pad/truncate every sequence to max_len to ensure uniform shape
data = pad_sequences(sequences, maxlen=max_len)
print('Data Shape: {}'.format(data.shape))
labels = np.asarray(labels)
print("Shape of data tensor: ", data.shape)
print("Shape of label tensor: ", labels.shape)
# shuffle data and labels together (same permutation) before splitting
# NOTE(review): no random seed is set, so the split is not reproducible
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]
# everything after the first 20000 shuffled samples is held out for testing
x_test = data[training_samples + validation_samples:]
y_test = labels[training_samples + validation_samples:]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment