Skip to content

Instantly share code, notes, and snippets.

View bertCompile.py
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Fit on the training titles and labels for 5 epochs
model.fit(x=title_train, y=y_train, epochs=5)
# Score the fitted model on the test titles and labels
model.evaluate(x=title_test, y=y_test)
View bertModel.py
import tensorflow as tf
# Input layer: one scalar string per example (shape=(), dtype string), named 'news'
input_layer = tf.keras.Input(name='news', dtype=tf.string, shape=())
# BERT layers: run the preprocessing layer on the raw strings,
# then feed its output to the encoder layer
processed = bert_preprocess(input_layer)
output = bert_encoder(processed)
# Fully Connected Layers
View bertImport.py
!pip install tensorflow-text
import tensorflow_hub as hub
import tensorflow_text as text
# TF-Hub handles for the uncased English BERT preprocessor and encoder
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# Wrap each hub module as a Keras layer so it can be called inside a model
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)
View lstm.py
# Sequential stack: a 50-unit LSTM, then Dense layers of 20 and 5 ReLU units,
# ending in a single sigmoid output unit
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
View testData.py
# Convert each entry of the test split to an array of Word2Vec embeddings.
# NOTE(review): assumes each entry of title_test is a sequence of word tokens
# that `embed` accepts as a batch — confirm against the preprocessing step.
test = [np.array(embed(title)) for title in title_test]
# Entries differ in word count, so pad them all to a common length;
# dtype='float' keeps the embedding values from being cast to integers
# (pad_sequences defaults to an integer dtype).
test = tf.keras.preprocessing.sequence.pad_sequences(test, dtype='float')
View traintest.py
from sklearn.model_selection import train_test_split
# Hold out 20% of the titles/labels for testing; the fixed random_state
# makes the split reproducible between runs
title_train, title_test, y_train, y_test = train_test_split(
    titles,
    labels,
    test_size=0.2,
    random_state=1000,
)
View wiki250.py
import tensorflow_hub as hub
import tensorflow as tf
# Load the Wiki-words-250 text-embedding module from TF-Hub
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")
# Convert each series of words in the training split to a word2vec embedding
indiv = []
for i in title_train:
    # Loop body re-indented so the snippet parses as Python.
    temp = np.array(embed(i))
    # NOTE(review): the snippet appears to be truncated here — `temp` is never
    # stored; presumably it should be appended to `indiv`. Confirm against the
    # full file before relying on `indiv`.
View nltk.py
#Import nltk preprocessing library to convert text into a readable format
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Download the NLTK data packages backing the imports above:
# 'punkt' for tokenizing, 'wordnet' for the lemmatizer, 'stopwords' for the stopword corpus
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)
View string.py
import string
# Normalize every title to lowercase
data['title'] = data['title'].str.lower()
data.head()
# Collect all punctuation characters to handle in a single list,
# starting from the standard ASCII set in string.punctuation.
# NOTE(review): string.punctuation already contains the ASCII ' and ";
# presumably non-ASCII (typographic) quotes are added after this — confirm.
punc = [*string.punctuation]
View eda.py
# Character length of each title, computed once and reused for every
# statistic and for the plot (previously recomputed four times)
title_lengths = data['title'].apply(len)
# Character Length of Titles - Min, Mean, Max
print('Mean Length', title_lengths.mean())
print('Min Length', title_lengths.min())
print('Max Length', title_lengths.max())
# Plot the distribution of title lengths (in characters) as a histogram
import seaborn as sns
x = title_lengths.plot.hist()