# Imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

def tokenize_padder(train_text, test_text,
                    chars_to_filter = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                    oov_token = "OOV",
                    maxlen = 500,
                    padding = "pre",
                    truncating = "post"
                    ):
    # Create tokenizer
    tokenizer = Tokenizer(filters = chars_to_filter,
                          oov_token = oov_token)
    # Fit tokenizer on training data only, so no vocabulary leaks in from the test set
    tokenizer.fit_on_texts(train_text)
    # Convert texts to sequences of integer word indices
    train_sequences = tokenizer.texts_to_sequences(train_text)
    test_sequences = tokenizer.texts_to_sequences(test_text)
    # Pad and trim sequences to a fixed length
    # Pre-padding is empirically better for sequence modelling
    # Post-truncating ensures the titles (at the start of each text) are kept in observations
    train_padded = pad_sequences(train_sequences, maxlen = maxlen, padding = padding, truncating = truncating)
    test_padded = pad_sequences(test_sequences, maxlen = maxlen, padding = padding, truncating = truncating)
    return tokenizer, train_padded, test_padded
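
# --- Illustrative check (toy strings below are hypothetical) ---
# Confirms the function returns (n_samples, maxlen) integer arrays and that
# words unseen during fitting are mapped to the OOV token's index.
demo_tok, demo_train_pad, demo_test_pad = tokenize_padder(
    ["fake headline spreads online", "verified report from agencies"],
    ["unseen words become oov"],
    maxlen = 6)
print(demo_train_pad.shape)          # (2, 6)
print(demo_tok.word_index["OOV"])    # 1 -- the reserved out-of-vocabulary index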
# Split into train and test data
# df is assumed to be a pandas DataFrame defined elsewhere, with an "all_text"
# column of article text and a binary "fake_news" label column
X = df.all_text.values
y = np.array(df["fake_news"], dtype = "float32")
text_train, text_test, y_train, y_test = train_test_split(X, y,
                                                          test_size = 0.2,
                                                          shuffle = True,
                                                          # reproducible split
                                                          random_state = 1)
# Process, tokenize, pad/trim
tokenizer, X_train, X_test = tokenize_padder(text_train, text_test)
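
# --- Downstream sketch (assumed architecture, not defined in this gist) ---
# One plausible way to consume the padded sequences: an Embedding layer whose
# input_dim covers every tokenizer index plus the 0 used for padding.
# The layer sizes here are arbitrary illustrative choices.
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense

vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value
model = Sequential([
    Embedding(input_dim = vocab_size, output_dim = 64, input_length = 500),
    GlobalAveragePooling1D(),
    Dense(1, activation = "sigmoid")  # binary fake-news label
])
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
# model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 3)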