@Muhammad4hmed
Last active December 21, 2020 09:30
# config.py
import transformers
# this is the maximum number of tokens in the sentence
MAX_LEN = 512
# batch size is small because the model is huge!
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# let's train for a maximum of 10 epochs
EPOCHS = 10
# define path to BERT model files
BERT_PATH = "../input/bert_base_uncased/"
# this is where you want to save the model
MODEL_PATH = "model.bin"
# training file
TRAINING_FILE = "../input/imdb.csv"
# define the tokenizer
# we use tokenizer and model
# from huggingface's transformers
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)
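
# tokenizer_check.py (illustrative sketch, not part of the original gist)
# a quick look at what the tokenizer produces for a toy sentence; the
# sentence and file name are assumptions, and BERT_PATH must point to
# valid bert-base-uncased files for this to run
import config

if __name__ == "__main__":
    enc = config.TOKENIZER.encode_plus(
        "this movie was surprisingly good",
        None,
        add_special_tokens=True,
        max_length=16,
        padding="max_length",
        truncation=True,
    )
    # input_ids, attention_mask and token_type_ids are the three lists
    # that dataset.py converts to tensors
    print(enc["input_ids"])
    print(enc["attention_mask"])
    print(enc["token_type_ids"])
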
# dataset.py
import config
import torch
class BERTDataset:
    def __init__(self, review, target):
        """
        :param review: list or numpy array of strings
        :param target: list or numpy array of binary targets
        """
        self.review = review
        self.target = target
        # we fetch max len and tokenizer from config.py
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        # this returns the length of the dataset
        return len(self.review)

    def __getitem__(self, item):
        # for a given item index, return a dictionary
        # of inputs
        review = str(self.review[item])
        review = " ".join(review.split())
        # encode_plus comes from huggingface's transformers
        # and exists for all tokenizers they offer
        # it can be used to convert a given string
        # to ids, mask and token type ids which are
        # needed for models like BERT
        # here, review is a string
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # pad and truncate to max_len
            # (pad_to_max_length is deprecated in newer transformers)
            padding="max_length",
            truncation=True,
        )
        # ids are ids of tokens generated
        # after tokenizing reviews
        ids = inputs["input_ids"]
        # mask is 1 where we have input
        # and 0 where we have padding
        mask = inputs["attention_mask"]
        # token type ids behave the same way as
        # mask in this specific case
        # in case of two sentences, this is 0
        # for the first sentence and 1 for the second sentence
        token_type_ids = inputs["token_type_ids"]
        # now we return everything
        # note that ids, mask and token_type_ids
        # are all long datatypes and targets is float
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }
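
# dataset_check.py (illustrative sketch, not part of the original gist)
# builds BERTDataset on two toy reviews and inspects one item; the toy
# strings and the file name are assumptions
import numpy as np
import dataset

if __name__ == "__main__":
    ds = dataset.BERTDataset(
        review=np.array(["what a great movie!", "this was terrible"]),
        target=np.array([1, 0]),
    )
    print(len(ds))  # 2
    item = ds[0]
    # ids, mask and token_type_ids are 1-D long tensors of length MAX_LEN,
    # targets is a float scalar tensor
    for key, value in item.items():
        print(key, value.dtype, value.shape)
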
# model.py
import config
import transformers
import torch.nn as nn
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        # we fetch the model from the BERT_PATH defined in
        # config.py
        self.bert = transformers.BertModel.from_pretrained(
            config.BERT_PATH
        )
        # add a dropout for regularization
        self.bert_drop = nn.Dropout(0.3)
        # a simple linear layer for output
        # yes, there is only one output
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        # BERT in its default settings returns two outputs:
        # the last hidden state and the output of the BERT pooler layer
        # we use the output of the pooler which is of size
        # (batch_size, hidden_size)
        # hidden size can be 768 or 1024 depending on
        # whether we are using bert base or large respectively
        # in our case, it is 768
        # note that this model is pretty simple
        # you might want to use the last hidden state
        # or several hidden states
        # return_dict=False keeps the tuple output unpacked below
        # (newer transformers versions return a ModelOutput by default)
        _, o2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        # pass through the dropout layer
        bo = self.bert_drop(o2)
        # pass through the linear layer
        output = self.out(bo)
        # return output
        return output
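
# model_check.py (illustrative sketch, not part of the original gist)
# runs a dummy batch through BERTBaseUncased to confirm the output shape
# is (batch_size, 1); the random ids below are assumptions and BERT_PATH
# must point to valid model files
import torch
from model import BERTBaseUncased

if __name__ == "__main__":
    model = BERTBaseUncased()
    model.eval()
    batch_size, seq_len = 2, 16
    # 30522 is the bert-base-uncased vocabulary size
    ids = torch.randint(0, 30522, (batch_size, seq_len), dtype=torch.long)
    mask = torch.ones((batch_size, seq_len), dtype=torch.long)
    token_type_ids = torch.zeros((batch_size, seq_len), dtype=torch.long)
    with torch.no_grad():
        out = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    print(out.shape)  # torch.Size([2, 1]), raw logits before sigmoid
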
# engine.py
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import (
    ReduceLROnPlateau,
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
)

def loss_fn(outputs, targets):
    """
    This function returns the loss.
    :param outputs: output from the model (real numbers)
    :param targets: input targets (binary)
    """
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

def get_scheduler(optimizer, scheduler):
    # return the requested torch learning rate scheduler by name
    if scheduler == 'ReduceLROnPlateau':
        scheduler = ReduceLROnPlateau(
            optimizer, mode='min', factor=0.2,
            patience=4, verbose=True, eps=1e-6
        )
    elif scheduler == 'CosineAnnealingLR':
        scheduler = CosineAnnealingLR(
            optimizer, T_max=10, eta_min=1e-6, last_epoch=-1
        )
    elif scheduler == 'CosineAnnealingWarmRestarts':
        scheduler = CosineAnnealingWarmRestarts(
            optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )
    return scheduler

def train_fn(data_loader, model, optimizer, device, scheduler):
    """
    This is the training function which trains for one epoch
    :param data_loader: it is the torch dataloader object
    :param model: torch model, bert in our case
    :param optimizer: adam, sgd, etc
    :param device: can be cpu or cuda
    :param scheduler: learning rate scheduler
    """
    # put the model in training mode
    model.train()
    # loop over all batches
    for d in data_loader:
        # extract ids, token type ids and mask
        # from the current batch
        # also extract targets
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets = d["targets"]
        # move everything to the specified device
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        # zero-grad the optimizer
        optimizer.zero_grad()
        # pass through the model
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        # calculate the loss
        loss = loss_fn(outputs, targets)
        # backward step on the loss
        loss.backward()
        # step the optimizer
        optimizer.step()
        # step the scheduler
        scheduler.step()

def eval_fn(data_loader, model, device):
    """
    this is the validation function that generates
    predictions on validation data
    :param data_loader: it is the torch dataloader object
    :param model: torch model, bert in our case
    :param device: can be cpu or cuda
    :return: outputs and targets
    """
    # put the model in eval mode
    model.eval()
    # initialize empty lists for
    # targets and outputs
    fin_targets = []
    fin_outputs = []
    # use the no_grad scope
    # it's very important, else you might
    # run out of gpu memory
    with torch.no_grad():
        # this part is the same as the training function
        # except for the fact that there is no
        # zero_grad of the optimizer and there is no loss
        # calculation or scheduler step
        for d in data_loader:
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]
            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            # move targets to cpu and extend the final list
            targets = targets.cpu().detach()
            fin_targets.extend(targets.numpy().tolist())
            # apply sigmoid, move outputs to cpu and extend the final list
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs.numpy().tolist())
    return fin_outputs, fin_targets
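
# engine_check.py (illustrative sketch, not part of the original gist)
# a worked example of loss_fn and get_scheduler on toy values; the toy
# logits/targets and the dummy parameter are assumptions
import torch
import torch.nn as nn
import engine

if __name__ == "__main__":
    # loss_fn expects raw logits of shape (batch_size, 1) and a flat
    # float target vector, which it reshapes to (batch_size, 1)
    logits = torch.tensor([[2.0], [-1.0]])
    targets = torch.tensor([1.0, 0.0])
    print(engine.loss_fn(logits, targets))  # mean BCE-with-logits, a scalar

    # get_scheduler wraps three torch schedulers behind a string name
    dummy = nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.Adam([dummy], lr=3e-5)
    scheduler = engine.get_scheduler(optimizer, "CosineAnnealingLR")
    print(type(scheduler).__name__)
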
# train.py
import config
import dataset
import engine
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
from model import BERTBaseUncased
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

def train():
    # this function trains the model
    # read the training file and fill NaN values with "none"
    # you can also choose to drop NaN values in this
    # specific dataset
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    # sentiment = 1 if it is positive
    # else sentiment = 0
    dfx.sentiment = dfx.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )
    # we split the data into a single training
    # and validation fold
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values
    )
    # reset index
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    # initialize BERTDataset from dataset.py
    # for the training dataset
    train_dataset = dataset.BERTDataset(
        review=df_train.review.values,
        target=df_train.sentiment.values
    )
    # create the training dataloader
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )
    # initialize BERTDataset from dataset.py
    # for the validation dataset
    valid_dataset = dataset.BERTDataset(
        review=df_valid.review.values,
        target=df_valid.sentiment.values
    )
    # create the validation data loader
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )
    # initialize the cuda device
    # fall back to cpu if you don't have a GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # load the model and send it to the device
    model = BERTBaseUncased()
    model.to(device)
    # create the parameters we want to optimize
    # we generally don't use any weight decay for bias
    # and layer norm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if
                not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if
                any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    # calculate the number of training steps
    # this is used by the scheduler
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS
    )
    # AdamW optimizer
    # AdamW is the most widely used optimizer
    # for transformer based networks
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # fetch a scheduler
    # you can also try using reduce lr on plateau
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    # if you have multiple GPUs
    # move the model to DataParallel
    # to use multiple GPUs
    model = nn.DataParallel(model)
    # start training the epochs
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(
            train_data_loader, model, optimizer, device, scheduler
        )
        outputs, targets = engine.eval_fn(
            valid_data_loader, model, device
        )
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
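
# predict.py (illustrative sketch, not part of the original gist)
# loads the weights saved by train() and scores a single review; the
# review text is an assumption. note that train() saves the state dict
# of an nn.DataParallel wrapper, so the keys carry a "module." prefix,
# which is why we wrap the model the same way before loading
import config
import torch
import torch.nn as nn
from model import BERTBaseUncased

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = nn.DataParallel(BERTBaseUncased())
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))
    model.to(device)
    model.eval()
    review = "a wonderful, heartfelt film"
    inputs = config.TOKENIZER.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length=config.MAX_LEN,
        padding="max_length",
        truncation=True,
    )
    # add a batch dimension of 1 and move to the device
    ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(device)
    mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(device)
    token_type_ids = torch.tensor([inputs["token_type_ids"]], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    print(torch.sigmoid(logits).item())  # probability of positive sentiment
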
# metrics.py
import numpy as np
from sklearn import metrics

def multi_class_roc_auc(true, pred_probs_arr, labels):
    # true and pred_probs_arr are DataFrames with one column per label
    auc_all = []
    for label_number in labels:
        true_labels = true.loc[:, label_number].copy()
        pred_probs = pred_probs_arr.loc[:, label_number].copy()
        # AUROC (sliding across multiple decision thresholds)
        fpr, tpr, thresholds = metrics.roc_curve(
            y_true=true_labels,
            y_score=pred_probs,
            pos_label=1
        )
        auc = metrics.auc(fpr, tpr)
        auc_all.append(auc)
    print(f'AUC of each class: {auc_all}')
    return np.mean(auc_all)

# the block below belongs at the bottom of train.py
import os
import random

if __name__ == "__main__":
    def seed_torch(seed=42):
        # seed python, numpy and torch for reproducibility
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    # the original snippet used CFG.seed, but CFG is not defined anywhere
    # in this gist, so we fall back to the function's default seed of 42
    seed_torch(seed=42)
    train()
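
# metrics_check.py (illustrative sketch, not part of the original gist)
# multi_class_roc_auc expects DataFrames of true labels and predicted
# probabilities with one column per class; the toy frames below are
# assumptions
import pandas as pd
from metrics import multi_class_roc_auc

if __name__ == "__main__":
    labels = ["cat", "dog"]
    true = pd.DataFrame({"cat": [1, 0, 1, 0], "dog": [0, 1, 0, 1]})
    pred_probs_arr = pd.DataFrame(
        {"cat": [0.9, 0.2, 0.7, 0.4], "dog": [0.1, 0.8, 0.3, 0.6]}
    )
    mean_auc = multi_class_roc_auc(true, pred_probs_arr, labels)
    print(mean_auc)  # 1.0 for these perfectly separable toy labels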