Shaya Farahmand shayaf84

## bertCompile.py
# Training configuration: Adam optimizer, binary cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training titles and labels for 5 epochs.
model.fit(title_train, y_train, epochs=5)

# Evaluate the fitted model on the test titles and labels.
model.evaluate(title_test, y_test)

## bertModel.py
import tensorflow as tf

# Input layer: one scalar string per example (shape=()), named 'news'.
input_layer = tf.keras.layers.Input(
    shape=(),
    dtype=tf.string,
    name='news',
)

# BERT layers: pass the string input through the preprocessing layer, then
# feed the result to the encoder. Both layers are created outside this snippet.
processed = bert_preprocess(input_layer)
output = bert_encoder(processed)

# Fully Connected Layers
# NOTE(review): no layers follow this heading in this snippet.

## bertImport.py
!pip install tensorflow-text
import tensorflow_hub as hub
import tensorflow_text as text

# TF Hub handles for the bert_en_uncased preprocessing model and the
# bert_en_uncased L-12 / H-768 / A-12 encoder.
_BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
_BERT_ENCODER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

# Wrap both hub models as Keras layers so they can be called on Keras tensors.
bert_preprocess = hub.KerasLayer(_BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(_BERT_ENCODER_URL)

## lstm.py
# Sequential stack: a 50-unit LSTM, then Dense layers with 20 and 5 ReLU
# units, ending in a single sigmoid output unit.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

## testData.py
# Convert every test title to its Word2Vec embedding, stored as a NumPy array.
test = [np.array(embed(title)) for title in title_test]

# Titles contain different numbers of words, so pad the embedded sequences to
# a common length. dtype='float' keeps the embedding values as floats rather
# than pad_sequences' default integer dtype.
test = tf.keras.preprocessing.sequence.pad_sequences(test, dtype='float')

## traintest.py
from sklearn.model_selection import train_test_split

# Hold out 20% of the titles/labels as the test set; the fixed random_state
# makes the split reproducible.
title_train, title_test, y_train, y_test = train_test_split(
    titles,
    labels,
    test_size=0.2,
    random_state=1000,
)

## wiki250.py
import tensorflow_hub as hub
import tensorflow as tf

# Load the Wiki-words-250 text-embedding module from TF Hub.
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

# Convert each training title to its word2vec embedding.
# Every embedding is appended so that `indiv` ends up holding one array per
# title, the same way the test-data snippet fills `test`.
indiv = []
for i in title_train:
  temp = np.array(embed(i))
  indiv.append(temp)


## nltk.py
#Import nltk preprocessing library to convert text into a readable format
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download the punkt, wordnet and stopwords NLTK resources, in that order.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

## string.py
import string

# Normalise every title to lowercase (overwrites the 'title' column in place).
data['title'] = data['title'].str.lower()
data.head()


# Collect the punctuation characters into one list so more can be added to it.
# NOTE(review): string.punctuation already contains the ASCII ' and ";
# any further quote characters (e.g. typographic quotes) would have to be
# appended separately -- nothing in this snippet does so.
punc = list(string.punctuation)

## eda.py
# Character length of every title, computed once and reused for the summary
# statistics and the histogram below.
title_lengths = data['title'].apply(len)

# Character Length of Titles - Min, Mean, Max
print('Mean Length', title_lengths.mean())
print('Min Length', title_lengths.min())
print('Max Length', title_lengths.max())

# Plot the distribution of title lengths (in characters) as a histogram.
import seaborn as sns

x = title_lengths.plot.hist()
	# Training configuration: Adam optimizer, binary cross-entropy loss, and
	# accuracy as the reported metric.
	model.compile(
		optimizer='adam',
		loss='binary_crossentropy',
		metrics=['accuracy'],
	)

	# Fit on the training titles and labels for 5 epochs.
	model.fit(title_train, y_train, epochs=5)

	# Evaluate the fitted model on the test titles and labels.
	model.evaluate(title_test, y_test)
	import tensorflow as tf

	# Input layer: one scalar string per example (shape=()), named 'news'.
	input_layer = tf.keras.layers.Input(
		shape=(),
		dtype=tf.string,
		name='news',
	)

	# BERT layers: pass the string input through the preprocessing layer, then
	# feed the result to the encoder. Both layers are created outside this
	# snippet.
	processed = bert_preprocess(input_layer)
	output = bert_encoder(processed)

	# Fully Connected Layers
	# NOTE(review): no layers follow this heading in this snippet.
	!pip install tensorflow-text
	import tensorflow_hub as hub
	import tensorflow_text as text

	# TF Hub handles for the bert_en_uncased preprocessing model and the
	# bert_en_uncased L-12 / H-768 / A-12 encoder.
	_BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
	_BERT_ENCODER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

	# Wrap both hub models as Keras layers so they can be called on Keras tensors.
	bert_preprocess = hub.KerasLayer(_BERT_PREPROCESS_URL)
	bert_encoder = hub.KerasLayer(_BERT_ENCODER_URL)
	# Sequential stack: a 50-unit LSTM, then Dense layers with 20 and 5 ReLU
	# units, ending in a single sigmoid output unit.
	model = tf.keras.models.Sequential([
		tf.keras.layers.LSTM(50),
		tf.keras.layers.Dense(20, activation='relu'),
		tf.keras.layers.Dense(5, activation='relu'),
		tf.keras.layers.Dense(1, activation='sigmoid'),
	])
	# Convert every test title to its Word2Vec embedding, stored as a NumPy array.
	test = []

	# The two statements below form the loop body, so they must be indented
	# one level deeper than the `for` line.
	for i in title_test:
		temp = np.array(embed(i))
		test.append(temp)

	# Titles contain different numbers of words, so pad the embedded sequences
	# to a common length. dtype='float' keeps the embedding values as floats
	# rather than pad_sequences' default integer dtype.
	test = tf.keras.preprocessing.sequence.pad_sequences(test,dtype='float')
	from sklearn.model_selection import train_test_split

	# Hold out 20% of the titles/labels as the test set; the fixed random_state
	# makes the split reproducible.
	title_train, title_test, y_train, y_test = train_test_split(
		titles,
		labels,
		test_size=0.2,
		random_state=1000,
	)
	import tensorflow_hub as hub
	import tensorflow as tf

	# Load the Wiki-words-250 text-embedding module from TF Hub.
	embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

	# Convert each training title to its word2vec embedding.
	# The loop body is indented under the `for` line, and every embedding is
	# appended so that `indiv` ends up holding one array per title.
	indiv = []
	for i in title_train:
		temp = np.array(embed(i))
		indiv.append(temp)
	#Import nltk preprocessing library to convert text into a readable format
	import nltk
	from nltk.tokenize import sent_tokenize
	from nltk.stem import WordNetLemmatizer
	from nltk.corpus import stopwords

	# Download the punkt, wordnet and stopwords NLTK resources, in that order.
	for resource in ('punkt', 'wordnet', 'stopwords'):
		nltk.download(resource)
	import string

	# Normalise every title to lowercase (overwrites the 'title' column in place).
	data['title'] = data['title'].str.lower()
	data.head()


	# Collect the punctuation characters into one list so more can be added to it.
	# NOTE(review): string.punctuation already contains the ASCII ' and ";
	# any further quote characters (e.g. typographic quotes) would have to be
	# appended separately -- nothing in this snippet does so.
	punc = list(string.punctuation)
	# Character length of every title, computed once and reused for the summary
	# statistics and the histogram below.
	title_lengths = data['title'].apply(len)

	# Character Length of Titles - Min, Mean, Max
	print('Mean Length', title_lengths.mean())
	print('Min Length', title_lengths.min())
	print('Max Length', title_lengths.max())

	# Plot the distribution of title lengths (in characters) as a histogram.
	import seaborn as sns

	x = title_lengths.plot.hist()