Last active
December 31, 2018 15:09
-
-
Save bluedistro/10fc4ba945b906cc30fa3f752a924378 to your computer and use it in GitHub Desktop.
tokenization
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tokenize raw review texts, pad them to a fixed length, then shuffle and
# split the data into train / validation / test sets.
#
# NOTE(review): this script depends on names defined elsewhere in the
# original notebook/file — confirm before running in isolation:
#   texts, labels            -- raw review strings and their targets
#   Tokenizer, pad_sequences -- presumably from keras.preprocessing; verify
#   np                       -- numpy

# cut off reviews after 500 words
max_len = 500
# train on 10000 samples
training_samples = 10000
# validate on 10000 samples
validation_samples = 10000
# consider only the top 10000 words
max_words = 10000
# build a tokenizer restricted to the max_words most frequent words
# (fixed comment: the original said "top 500", but num_words=max_words=10000)
tokenizer = Tokenizer(num_words=max_words)
# fit the tokenizer on the texts
tokenizer.fit_on_texts(texts)
# convert the texts to sequences of integer word indices
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens. ' % len(word_index))
# pad/truncate every sequence to max_len to ensure uniform shape
data = pad_sequences(sequences, maxlen=max_len)
print('Data Shape: {}'.format(data.shape))
labels = np.asarray(labels)
print("Shape of data tensor: ", data.shape)
print("Shape of label tensor: ", labels.shape)
# shuffle data and labels together (same permutation) before splitting
# NOTE(review): no random seed is set, so the split is not reproducible
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]
# everything after the first 20000 shuffled samples is held out for testing
x_test = data[training_samples + validation_samples:]
y_test = labels[training_samples + validation_samples:]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment