Skip to content

Instantly share code, notes, and snippets.

@pranavraikote
Created November 19, 2021 02:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pranavraikote/fe5255b4567ba13a0325b11fe73c2b83 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Input
from transformers import BertTokenizer
from transformers import TFAutoModel
# Load the IMDB reviews dataset and shuffle the rows so positive and
# negative examples are mixed before the train/test split further down.
data = pd.read_csv('IMDB Dataset.csv')
data = data.sample(frac=1).reset_index(drop=True)

# Encode the sentiment label as a two-element one-hot vector:
# 'positive' -> [0, 1], anything else -> [1, 0].
data['one_hot'] = data['sentiment'].apply(
    lambda s: [0, 1] if s == 'positive' else [1, 0])

# Fixed BERT input length; pre-allocate one row of token ids and one row
# of attention-mask values per review.
seq_len = 512
num_samples = len(data)
X_ids = np.zeros((num_samples, seq_len))
X_mask = np.zeros((num_samples, seq_len))

# Cased WordPiece tokenizer matching the 'bert-base-cased' checkpoint
# loaded for the model below.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Tokenize every review, padding/truncating to seq_len, and fill the
# pre-allocated id/mask matrices row by row.
# FIX: the loop body had lost its indentation (SyntaxError as pasted).
for i, phrase in enumerate(data['review']):
    tokens = tokenizer.encode_plus(
        phrase,
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,   # prepend [CLS], append [SEP]
        return_tensors='tf')
    X_ids[i, :] = tokens['input_ids']
    X_mask[i, :] = tokens['attention_mask']
def map_f(input_ids, masks, labels):
    """Reshape a (ids, mask, label) triple into the
    ({'input_ids', 'attention_mask'}, label) structure Keras expects.

    FIX: the function body had lost its indentation (SyntaxError as pasted).
    """
    return {'input_ids': input_ids, 'attention_mask': masks}, labels
labels = list(data.one_hot.values)

# Build a tf.data pipeline of ({'input_ids', 'attention_mask'}, label)
# examples, batched for training.
dataset = tf.data.Dataset.from_tensor_slices((X_ids, X_mask, labels))
dataset = dataset.map(map_f)

batch_size = 8
dataset = dataset.batch(batch_size, drop_remainder=True)

# 90/10 train/test split, measured in BATCHES.
# FIX: the original divided by 16 although the batch size is 8, so the
# "90%" train split actually took only ~45% of the batches; divide by
# the real batch size.
size = int((X_ids.shape[0] / batch_size) * 0.9)
train_ds = dataset.take(size)
test_ds = dataset.skip(size)
# Pre-trained BERT encoder; its weights are frozen below so only the
# classification head on top is trained.
bert = TFAutoModel.from_pretrained('bert-base-cased')

input_ids = Input(shape=(512,))
mask = Input(shape=(512,))

# Index [1] of the BERT output is the pooled [CLS] representation,
# shape (batch, 768).
embeddings = bert.bert(input_ids, attention_mask=mask)[1]
fc_1 = Dense(512, activation='relu')(embeddings)
fc_2 = Dense(2, activation='softmax', name='outputs')(fc_1)

# FIX: the model must end at the 2-class softmax head. The original used
# outputs=fc_1, training the 512-wide ReLU layer directly against the
# 2-element one-hot labels.
model = Model(inputs=[input_ids, mask], outputs=fc_2)

# layers[0] and layers[1] are the two Input layers; layers[2] is the
# BERT encoder — freeze it.
model.layers[2].trainable = False

opt = Adam(learning_rate=1e-5, decay=1e-6)
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
# FIX: the loss string was missing its closing quote (SyntaxError).
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=[acc])
model.summary()

history = model.fit(train_ds, epochs=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


def predictions(text):
    """Tokenize *text* into the {'input_ids', 'attention_mask'} dict the
    trained model expects.

    Tensors are cast to float64 to match the float numpy arrays the model
    was trained on. FIX: the function body had lost its indentation
    (SyntaxError as pasted).
    """
    tokens = tokenizer.encode_plus(
        text, max_length=512, truncation=True, padding='max_length',
        add_special_tokens=True, return_token_type_ids=False,
        return_tensors='tf')
    return {'input_ids': tf.cast(tokens['input_ids'], tf.float64),
            'attention_mask': tf.cast(tokens['attention_mask'], tf.float64)}


new = predictions("This is a good movie")
print(new)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment