@Muhammad4hmed
Last active December 21, 2020 09:30
# config.py
import transformers
# this is the maximum number of tokens in the sentence
MAX_LEN = 512
# batch size is small because the model is huge!
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# let's train for a maximum of 10 epochs
EPOCHS = 10
# define path to BERT model files
BERT_PATH = "../input/bert_base_uncased/"
# this is where you want to save the model
MODEL_PATH = "model.bin"
# training file
TRAINING_FILE = "../input/imdb.csv"
# define the tokenizer
# we use tokenizer and model
# from huggingface's transformers
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)
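
# tokenizer_check.py (illustrative sketch, not part of the original gist)
# a quick look at what the tokenizer produces for a toy sentence; the
# sentence and file name are assumptions, and BERT_PATH must point to
# valid bert-base-uncased files for this to run
import config

if __name__ == "__main__":
    enc = config.TOKENIZER.encode_plus(
        "this movie was surprisingly good",
        None,
        add_special_tokens=True,
        max_length=16,
        padding="max_length",
        truncation=True,
    )
    # input_ids, attention_mask and token_type_ids are the three lists
    # that dataset.py converts to tensors
    print(enc["input_ids"])
    print(enc["attention_mask"])
    print(enc["token_type_ids"])
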
# dataset.py
import config
import torch
class BERTDataset:
    def __init__(self, review, target):
        """
        :param review: list or numpy array of strings
        :param target: list or numpy array of binary targets
        """
        self.review = review
        self.target = target
        # we fetch max len and tokenizer from config.py
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        # this returns the length of the dataset
        return len(self.review)

    def __getitem__(self, item):
        # for a given item index, return a dictionary
        # of inputs
        review = str(self.review[item])
        review = " ".join(review.split())
        # encode_plus comes from huggingface's transformers
        # and exists for all tokenizers they offer
        # it can be used to convert a given string
        # to ids, mask and token type ids which are
        # needed for models like BERT
        # here, review is a string
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # pad and truncate to max_len
            # (pad_to_max_length is deprecated in newer transformers)
            padding="max_length",
            truncation=True,
        )
        # ids are ids of tokens generated
        # after tokenizing reviews
        ids = inputs["input_ids"]
        # mask is 1 where we have input
        # and 0 where we have padding
        mask = inputs["attention_mask"]
        # token type ids behave the same way as
        # mask in this specific case
        # in case of two sentences, this is 0
        # for the first sentence and 1 for the second sentence
        token_type_ids = inputs["token_type_ids"]
        # now we return everything
        # note that ids, mask and token_type_ids
        # are all long datatypes and targets is float
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }
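
# dataset_check.py (illustrative sketch, not part of the original gist)
# builds BERTDataset on two toy reviews and inspects one item; the toy
# strings and the file name are assumptions
import numpy as np
import dataset

if __name__ == "__main__":
    ds = dataset.BERTDataset(
        review=np.array(["what a great movie!", "this was terrible"]),
        target=np.array([1, 0]),
    )
    print(len(ds))  # 2
    item = ds[0]
    # ids, mask and token_type_ids are 1-D long tensors of length MAX_LEN,
    # targets is a float scalar tensor
    for key, value in item.items():
        print(key, value.dtype, value.shape)
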
# model.py
import config
import transformers
import torch.nn as nn
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        # we fetch the model from the BERT_PATH defined in
        # config.py
        self.bert = transformers.BertModel.from_pretrained(
            config.BERT_PATH
        )
        # add a dropout for regularization
        self.bert_drop = nn.Dropout(0.3)
        # a simple linear layer for output
        # yes, there is only one output
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        # BERT in its default settings returns two outputs:
        # the last hidden state and the output of the BERT pooler layer
        # we use the output of the pooler which is of size
        # (batch_size, hidden_size)
        # hidden size can be 768 or 1024 depending on
        # whether we are using bert base or large respectively
        # in our case, it is 768
        # note that this model is pretty simple
        # you might want to use the last hidden state
        # or several hidden states
        # return_dict=False keeps the tuple output unpacked below
        # (newer transformers versions return a ModelOutput by default)
        _, o2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        # pass through the dropout layer
        bo = self.bert_drop(o2)
        # pass through the linear layer
        output = self.out(bo)
        # return output
        return output
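
# model_check.py (illustrative sketch, not part of the original gist)
# runs a dummy batch through BERTBaseUncased to confirm the output shape
# is (batch_size, 1); the random ids below are assumptions and BERT_PATH
# must point to valid model files
import torch
from model import BERTBaseUncased

if __name__ == "__main__":
    model = BERTBaseUncased()
    model.eval()
    batch_size, seq_len = 2, 16
    # 30522 is the bert-base-uncased vocabulary size
    ids = torch.randint(0, 30522, (batch_size, seq_len), dtype=torch.long)
    mask = torch.ones((batch_size, seq_len), dtype=torch.long)
    token_type_ids = torch.zeros((batch_size, seq_len), dtype=torch.long)
    with torch.no_grad():
        out = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    print(out.shape)  # torch.Size([2, 1]), raw logits before sigmoid
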
# engine.py
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import (
    ReduceLROnPlateau,
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
)

def loss_fn(outputs, targets):
    """
    This function returns the loss.
    :param outputs: output from the model (real numbers)
    :param targets: input targets (binary)
    """
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

def get_scheduler(optimizer, scheduler):
    # return the requested torch learning rate scheduler by name
    if scheduler == 'ReduceLROnPlateau':
        scheduler = ReduceLROnPlateau(
            optimizer, mode='min', factor=0.2,
            patience=4, verbose=True, eps=1e-6
        )
    elif scheduler == 'CosineAnnealingLR':
        scheduler = CosineAnnealingLR(
            optimizer, T_max=10, eta_min=1e-6, last_epoch=-1
        )
    elif scheduler == 'CosineAnnealingWarmRestarts':
        scheduler = CosineAnnealingWarmRestarts(
            optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )
    return scheduler

def train_fn(data_loader, model, optimizer, device, scheduler):
    """
    This is the training function which trains for one epoch
    :param data_loader: it is the torch dataloader object
    :param model: torch model, bert in our case
    :param optimizer: adam, sgd, etc
    :param device: can be cpu or cuda
    :param scheduler: learning rate scheduler
    """
    # put the model in training mode
    model.train()
    # loop over all batches
    for d in data_loader:
        # extract ids, token type ids and mask
        # from the current batch
        # also extract targets
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets = d["targets"]
        # move everything to the specified device
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        # zero-grad the optimizer
        optimizer.zero_grad()
        # pass through the model
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        # calculate the loss
        loss = loss_fn(outputs, targets)
        # backward step on the loss
        loss.backward()
        # step the optimizer
        optimizer.step()
        # step the scheduler
        scheduler.step()

def eval_fn(data_loader, model, device):
    """
    this is the validation function that generates
    predictions on validation data
    :param data_loader: it is the torch dataloader object
    :param model: torch model, bert in our case
    :param device: can be cpu or cuda
    :return: outputs and targets
    """
    # put the model in eval mode
    model.eval()
    # initialize empty lists for
    # targets and outputs
    fin_targets = []
    fin_outputs = []
    # use the no_grad scope
    # it's very important, else you might
    # run out of gpu memory
    with torch.no_grad():
        # this part is the same as the training function
        # except for the fact that there is no
        # zero_grad of the optimizer and there is no loss
        # calculation or scheduler step
        for d in data_loader:
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]
            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            # move targets to cpu and extend the final list
            targets = targets.cpu().detach()
            fin_targets.extend(targets.numpy().tolist())
            # apply sigmoid, move outputs to cpu and extend the final list
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs.numpy().tolist())
    return fin_outputs, fin_targets
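
# engine_check.py (illustrative sketch, not part of the original gist)
# a worked example of loss_fn and get_scheduler on toy values; the toy
# logits/targets and the dummy parameter are assumptions
import torch
import torch.nn as nn
import engine

if __name__ == "__main__":
    # loss_fn expects raw logits of shape (batch_size, 1) and a flat
    # float target vector, which it reshapes to (batch_size, 1)
    logits = torch.tensor([[2.0], [-1.0]])
    targets = torch.tensor([1.0, 0.0])
    print(engine.loss_fn(logits, targets))  # mean BCE-with-logits, a scalar

    # get_scheduler wraps three torch schedulers behind a string name
    dummy = nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.Adam([dummy], lr=3e-5)
    scheduler = engine.get_scheduler(optimizer, "CosineAnnealingLR")
    print(type(scheduler).__name__)
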
# train.py
import config
import dataset
import engine
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
from model import BERTBaseUncased
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

def train():
    # this function trains the model
    # read the training file and fill NaN values with "none"
    # you can also choose to drop NaN values in this
    # specific dataset
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    # sentiment = 1 if it is positive
    # else sentiment = 0
    dfx.sentiment = dfx.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )
    # we split the data into a single training
    # and validation fold
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values
    )
    # reset index
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    # initialize BERTDataset from dataset.py
    # for the training dataset
    train_dataset = dataset.BERTDataset(
        review=df_train.review.values,
        target=df_train.sentiment.values
    )
    # create the training dataloader
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )
    # initialize BERTDataset from dataset.py
    # for the validation dataset
    valid_dataset = dataset.BERTDataset(
        review=df_valid.review.values,
        target=df_valid.sentiment.values
    )
    # create the validation data loader
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )
    # initialize the cuda device
    # fall back to cpu if you don't have a GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # load the model and send it to the device
    model = BERTBaseUncased()
    model.to(device)
    # create the parameters we want to optimize
    # we generally don't use any weight decay for bias
    # and layer norm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if
                not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if
                any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    # calculate the number of training steps
    # this is used by the scheduler
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS
    )
    # AdamW optimizer
    # AdamW is the most widely used optimizer
    # for transformer based networks
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # fetch a scheduler
    # you can also try using reduce lr on plateau
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    # if you have multiple GPUs
    # move the model to DataParallel
    # to use multiple GPUs
    model = nn.DataParallel(model)
    # start training the epochs
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(
            train_data_loader, model, optimizer, device, scheduler
        )
        outputs, targets = engine.eval_fn(
            valid_data_loader, model, device
        )
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
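
# predict.py (illustrative sketch, not part of the original gist)
# loads the weights saved by train() and scores a single review; the
# review text is an assumption. note that train() saves the state dict
# of an nn.DataParallel wrapper, so the keys carry a "module." prefix,
# which is why we wrap the model the same way before loading
import config
import torch
import torch.nn as nn
from model import BERTBaseUncased

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = nn.DataParallel(BERTBaseUncased())
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))
    model.to(device)
    model.eval()
    review = "a wonderful, heartfelt film"
    inputs = config.TOKENIZER.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length=config.MAX_LEN,
        padding="max_length",
        truncation=True,
    )
    # add a batch dimension of 1 and move to the device
    ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(device)
    mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(device)
    token_type_ids = torch.tensor([inputs["token_type_ids"]], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    print(torch.sigmoid(logits).item())  # probability of positive sentiment
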
# metrics.py
import numpy as np
from sklearn import metrics

def multi_class_roc_auc(true, pred_probs_arr, labels):
    # true and pred_probs_arr are DataFrames with one column per label
    auc_all = []
    for label_number in labels:
        true_labels = true.loc[:, label_number].copy()
        pred_probs = pred_probs_arr.loc[:, label_number].copy()
        # AUROC (sliding across multiple decision thresholds)
        fpr, tpr, thresholds = metrics.roc_curve(
            y_true=true_labels,
            y_score=pred_probs,
            pos_label=1
        )
        auc = metrics.auc(fpr, tpr)
        auc_all.append(auc)
    print(f'AUC of each class: {auc_all}')
    return np.mean(auc_all)

# the block below belongs at the bottom of train.py
import os
import random

if __name__ == "__main__":
    def seed_torch(seed=42):
        # seed python, numpy and torch for reproducibility
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    # the original snippet used CFG.seed, but CFG is not defined anywhere
    # in this gist, so we fall back to the function's default seed of 42
    seed_torch(seed=42)
    train()
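
# metrics_check.py (illustrative sketch, not part of the original gist)
# multi_class_roc_auc expects DataFrames of true labels and predicted
# probabilities with one column per class; the toy frames below are
# assumptions
import pandas as pd
from metrics import multi_class_roc_auc

if __name__ == "__main__":
    labels = ["cat", "dog"]
    true = pd.DataFrame({"cat": [1, 0, 1, 0], "dog": [0, 1, 0, 1]})
    pred_probs_arr = pd.DataFrame(
        {"cat": [0.9, 0.2, 0.7, 0.4], "dog": [0.1, 0.8, 0.3, 0.6]}
    )
    mean_auc = multi_class_roc_auc(true, pred_probs_arr, labels)
    print(mean_auc)  # 1.0 for these perfectly separable toy labels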