# louismagowan/fake_news-tokenizer.py Secret

## fake_news-tokenizer.py
# Imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

def tokenize_padder(train_text, test_text,
                    chars_to_filter='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                    oov_token="OOV",
                    maxlen=500,
                    padding="pre",
                    truncating="post"):
    """Tokenize train/test texts and pad or trim them to a common length.

    The tokenizer is fitted on ``train_text`` only and then used to encode
    both ``train_text`` and ``test_text``.

    Args:
        train_text: Texts used to fit the tokenizer and to build the
            training sequences.
        test_text: Texts encoded with the already-fitted tokenizer.
        chars_to_filter: Passed to ``Tokenizer`` as ``filters``.
        oov_token: Passed to ``Tokenizer`` as ``oov_token``.
        maxlen: Passed to ``pad_sequences`` as ``maxlen``.
        padding: Passed to ``pad_sequences`` as ``padding``.
        truncating: Passed to ``pad_sequences`` as ``truncating``.

    Returns:
        A tuple ``(tokenizer, train_padded, test_padded)``: the fitted
        tokenizer and the ``pad_sequences`` output for each input.
    """
    # Fit on the training texts only, never on the test texts.
    tokenizer = Tokenizer(filters=chars_to_filter, oov_token=oov_token)
    tokenizer.fit_on_texts(train_text)

    def _encode(texts):
        # Pre-padding is empirically better for sequence modelling;
        # post-truncating ensures the titles are included in observations.
        encoded = tokenizer.texts_to_sequences(texts)
        return pad_sequences(encoded, maxlen=maxlen, padding=padding,
                             truncating=truncating)

    train_padded = _encode(train_text)
    test_padded = _encode(test_text)
    return tokenizer, train_padded, test_padded

# Hold out 20% of the rows as a test set.
X = df.all_text.values
y = np.array(df["fake_news"], dtype="float32")
text_train, text_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,  # fixed seed -> reproducible split
)

# Fit the tokenizer on the training texts, then encode and pad/trim both splits.
tokenizer, X_train, X_test = tokenize_padder(text_train, text_test)
	# Imports
	from keras.preprocessing.text import Tokenizer
	from keras.preprocessing.sequence import pad_sequences
	from sklearn.model_selection import train_test_split
	import numpy as np

	def tokenize_padder(train_text, test_text,
			chars_to_filter = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
			oov_token = "OOV",
			maxlen = 500,
			padding = "pre",
			truncating = "post"
			):
		"""Tokenize train/test texts and pad or trim them to a common length.

		The tokenizer is fitted on ``train_text`` only and then used to
		encode both ``train_text`` and ``test_text``.

		Args:
			train_text: Texts used to fit the tokenizer and to build the
				training sequences.
			test_text: Texts encoded with the already-fitted tokenizer.
			chars_to_filter: Passed to ``Tokenizer`` as ``filters``.
			oov_token: Passed to ``Tokenizer`` as ``oov_token``.
			maxlen: Passed to ``pad_sequences`` as ``maxlen``.
			padding: Passed to ``pad_sequences`` as ``padding``.
			truncating: Passed to ``pad_sequences`` as ``truncating``.

		Returns:
			A tuple ``(tokenizer, train_padded, test_padded)``.
		"""
		# Create tokenizer
		tokenizer = Tokenizer(filters = chars_to_filter,
				oov_token = oov_token)

		# Fit tokenizer on training data only
		tokenizer.fit_on_texts(train_text)

		# Generate sequences
		train_sequences = tokenizer.texts_to_sequences(train_text)
		test_sequences = tokenizer.texts_to_sequences(test_text)

		# Pad and trim sequences
		# Pre-padding is empirically better for sequence modelling
		# Post-truncating ensures the titles are included in observations
		train_padded = pad_sequences(train_sequences, maxlen = maxlen, padding = padding, truncating = truncating)
		test_padded = pad_sequences(test_sequences, maxlen = maxlen, padding = padding, truncating = truncating)

		return tokenizer, train_padded, test_padded

	# Hold out 20% of the rows as a test set.
	X = df.all_text.values
	y = np.array(df["fake_news"], dtype="float32")
	text_train, text_test, y_train, y_test = train_test_split(
		X,
		y,
		test_size=0.2,
		shuffle=True,
		random_state=1,  # fixed seed -> reproducible split
	)

	# Fit the tokenizer on the training texts, then encode and pad/trim both splits.
	tokenizer, X_train, X_test = tokenize_padder(text_train, text_test)