This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compile the model with the Adam optimizer, binary cross-entropy loss,
# and accuracy as the tracked metric.
# `model`, `title_train`/`title_test` and `y_train`/`y_test` come from other cells.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model for 5 epochs on the training split.
model.fit(title_train, y_train, epochs=5)

# Evaluate the trained model on the held-out test split.
model.evaluate(title_test, y_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf

# Input layer: a scalar string per example (shape=()), named 'news'.
input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='news')

# BERT layers: run the raw text through the preprocessing layer, then feed
# the result to the encoder. `bert_preprocess` and `bert_encoder` are not
# defined in this cell; they must already exist when it runs.
processed = bert_preprocess(input_layer)
output = bert_encoder(processed)

# Fully Connected Layers
# NOTE(review): the snippet ends here in this view; the layers that consume
# `output` are not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install tensorflow-text | |
import tensorflow_hub as hub | |
import tensorflow_text as text | |
#Use the bert preprocesser and bert encoder from tensorflow_hub | |
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3") | |
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sequential model: a 50-unit LSTM layer followed by three Dense layers
# (20 ReLU units, 5 ReLU units, then a single sigmoid output unit).
# Passing the layers as a list is equivalent to calling model.add() on each
# one in the same order.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert each entry of the test split to a Word2Vec embedding array.
# `embed`, `np`, `tf` and `title_test` are defined in other cells.
test = []
for i in title_test:
    temp = np.array(embed(i))
    test.append(temp)

# Pad the collected embedding arrays to a common length so they can be
# stacked into one float array (presumably the entries differ in word
# count -- confirm against how `title_test` is tokenized).
test = tf.keras.preprocessing.sequence.pad_sequences(test, dtype='float')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split

# Split data into training and testing datasets: 20% is held out for testing,
# and the fixed random_state keeps the split the same across runs.
# `titles` and `labels` are defined in other cells.
title_train, title_test, y_train, y_test = train_test_split(
    titles, labels, test_size=0.2, random_state=1000
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow_hub as hub
import tensorflow as tf

# Load the Wiki-words-250 text embedding module from TF Hub.
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

# Convert each series of words in the training split to a word2vec embedding.
# `np` and `title_train` are not defined in this cell; they must already exist.
indiv = []
for i in title_train:
    temp = np.array(embed(i))
    # NOTE(review): the snippet is cut off here in this view; whatever the
    # loop does with `temp` afterwards (e.g. collecting it into `indiv`) is
    # not visible and has deliberately not been reconstructed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the nltk preprocessing tools used to convert text into a readable
# format: sentence tokenizer, lemmatizer and stopword corpus.
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download the 'punkt', 'wordnet' and 'stopwords' nltk resources.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string

# Lowercase every title. `data` is a DataFrame defined in another cell.
data['title'] = data['title'].str.lower()
# Preview the first rows (only displayed by a notebook when it is the last
# expression in the cell).
data.head()

# Ensure that all necessary punctuation marks are in one list.
# NOTE(review): string.punctuation already contains the ASCII ' and "; the
# original note ("Include ' and \" as they are not default") presumably
# refers to typographic quote characters appended in lines that are not
# visible in this view -- confirm.
punc = list(string.punctuation)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Character length of titles - mean, min, max.
# Compute the per-title lengths once and reuse them for every statistic and
# for the plot, instead of re-applying len() to the column each time.
title_lengths = data['title'].apply(len)
print('Mean Length', title_lengths.mean())
print('Min Length', title_lengths.min())
print('Max Length', title_lengths.max())

# Plot the distribution of title lengths (in characters) as a histogram.
# `sns` is not referenced in the lines visible here; the import is kept
# because later cells may rely on it.
import seaborn as sns
x = title_lengths.plot.hist()
NewerOlder