import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from transformers import (
    AdamW,
    PreTrainedModel,
    RobertaConfig,
    RobertaModel,
    RobertaTokenizerFast,
    get_constant_schedule_with_warmup,
)

warnings.filterwarnings("ignore")
def set_seed(seed=0):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state
seed = 42
random_state = set_seed(seed)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")
# NOTE: df is assumed to be loaded elsewhere as a DataFrame with an `opcodes`
# text column and a binary `target` column.
df['fold'] = -1
skf = StratifiedKFold(n_splits=5)  # , shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(df.opcodes, df.target)):
    df.loc[val_idx, 'fold'] = fold

fold = 0
validation_df = df[df.fold == fold].reset_index(drop=True)
train_df = df[df.fold != fold].reset_index(drop=True)
class Data(Dataset):
    '''Yields raw (opcodes, target) pairs; tokenization happens per batch.'''

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        opcodes = self.data.opcodes[idx]
        target = self.data.target[idx]
        return opcodes, target


train_data = Data(data=train_df)
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=8)
val_data = Data(data=validation_df)
val_loader = DataLoader(dataset=val_data, shuffle=False, batch_size=8)
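# Optional sanity check (not in the original gist): with the default collate
# function a batch is a (tuple_of_strings, tensor_of_targets) pair, since
# strings are passed through as-is.
sample_opcodes, sample_targets = next(iter(train_loader))
print(len(sample_opcodes), sample_targets.shape)  # e.g. 8 torch.Size([8])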
class ReadabilityModel(PreTrainedModel):
    def __init__(self, conf):
        super(ReadabilityModel, self).__init__(conf)
        self.roberta = RobertaModel.from_pretrained(model_name, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l1 = nn.Linear(1024, 1)  # roberta-large hidden size is 1024
        torch.nn.init.normal_(self.l1.weight, std=0.02)

    def forward(self, ids, mask):
        out = self.roberta(input_ids=ids, attention_mask=mask)
        # Mean-pool the last hidden state over the sequence dimension.
        out = out['hidden_states'][-1]
        out = self.drop_out(out)
        out = torch.mean(out, dim=1)
        preds = self.l1(out).squeeze(-1)
        # Return raw logits: BCEWithLogitsLoss applies the sigmoid itself.
        # (The original softmax here normalized across the batch, a bug.)
        return preds
model_name = 'roberta-large'
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model_config = RobertaConfig.from_pretrained(model_name)
model_config.output_hidden_states = True
model = ReadabilityModel(model_config)
model = model.to(device)
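# Optional shape check with hypothetical dummy inputs (not in the original
# gist): two short strings through the tokenizer and model should yield one
# logit per example.
dummy = tokenizer(['add pop ret', 'mov push call'], padding=True,
                  return_tensors='pt', add_special_tokens=True)
with torch.no_grad():
    dummy_out = model(dummy['input_ids'].to(device),
                      dummy['attention_mask'].to(device))
print(dummy_out.shape)  # expected: torch.Size([2])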
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
scheduler = get_constant_schedule_with_warmup(optimizer, 100)
loss_fct = nn.BCEWithLogitsLoss()
epochs = 30
def get_preds(probs, thresh=0.5):
    '''Binarize predicted probabilities at `thresh`.'''
    return np.where(probs >= thresh, 1, 0)
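# Tiny worked example of the thresholding helper on hypothetical values:
# probabilities >= 0.5 map to class 1, the rest to class 0.
print(get_preds(np.array([0.2, 0.5, 0.91])))  # -> [0 1 1]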
best_accuracy = 0
for epoch in range(epochs):
    model.train()
    loss_train = 0.0
    for opcodes, targets in tqdm(train_loader):
        optimizer.zero_grad()
        batch = tokenizer(list(opcodes), truncation=True, padding=True,
                          return_tensors='pt', add_special_tokens=True)
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        # The DataLoader already collates targets into a tensor, so move it
        # to the device directly instead of re-wrapping with torch.tensor().
        targets = targets.to(device, dtype=torch.float)
        preds = model(input_ids, attention_mask)
        loss = loss_fct(preds, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        loss_train += loss.item()
    # Average the running loss once per epoch, after the batch loop.
    loss_train = loss_train / len(train_loader)
    model.eval()
    loss_val = 0.0
    preds_list, targets_list = [], []
    with torch.no_grad():
        for opcodes, targets in tqdm(val_loader):
            batch = tokenizer(list(opcodes), truncation=True, padding=True,
                              return_tensors='pt', add_special_tokens=True)
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
            preds = model(input_ids, attention_mask)
            loss_val += loss_fct(preds, targets).item()
            # The model returns logits; convert to probabilities for metrics.
            preds_list.append(torch.sigmoid(preds).cpu().numpy())
            targets_list.append(targets.cpu().numpy())
    loss_val = loss_val / len(val_loader)

    probs_val = np.concatenate(preds_list, axis=None)
    targets_val = np.concatenate(targets_list, axis=None)
    preds_val = get_preds(probs_val)
    accuracy = accuracy_score(targets_val, preds_val)
    # ROC AUC is computed on the probabilities, not the thresholded labels.
    roc_auc = roc_auc_score(targets_val, probs_val)

    print('Epoch: {} - Loss: {:.6f} - Loss val: {:.6f} - Accuracy: {:.3f} ROC_AUC_SCORE: {:.3f}'.format(
        epoch + 1, loss_train, loss_val, accuracy, roc_auc))

    if accuracy > best_accuracy:
        torch.save(model.state_dict(), 'roberta_baseline.bin')
        best_accuracy = accuracy
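# A minimal inference sketch, not part of the original gist: reload the best
# checkpoint and score a few held-out opcode sequences. It reuses the model,
# tokenizer, and get_preds helper defined above; the sample inputs are just
# the first rows of the validation split.
model.load_state_dict(torch.load('roberta_baseline.bin', map_location=device))
model.eval()
sample_opcodes = list(validation_df.opcodes[:4])
with torch.no_grad():
    batch = tokenizer(sample_opcodes, truncation=True, padding=True,
                      return_tensors='pt', add_special_tokens=True)
    logits = model(batch['input_ids'].to(device),
                   batch['attention_mask'].to(device))
    probs = torch.sigmoid(logits).cpu().numpy()
print(get_preds(probs))  # hard 0/1 labels for the sample sequences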