import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (AdamW, PreTrainedModel, RobertaConfig, RobertaModel,
                          RobertaTokenizerFast, get_constant_schedule_with_warmup)

warnings.filterwarnings("ignore")

def set_seed(seed=0):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state

seed = 42
random_state = set_seed(seed)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

# Split into 5 stratified folds, then hold fold 0 out for validation.
df['fold'] = -1
skf = StratifiedKFold(n_splits=5)  # , shuffle=True, random_state=42)
for fold, (train, val) in enumerate(skf.split(df.opcodes, df.target)):
    df.loc[val, 'fold'] = fold

fold = 0
validation_df = df[df.fold == fold].reset_index(drop=True)
train_df = df[df.fold != fold].reset_index(drop=True)
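
# Optional check (not in the original gist): the stratified split should keep
# the positive rate roughly equal across folds.
# print(df.groupby('fold').target.mean())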

class Data(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        opcodes = self.data.opcodes[idx]
        target = self.data.target[idx]
        return opcodes, target


train_data = Data(data=train_df)
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=8)
val_data = Data(data=validation_df)
val_loader = DataLoader(dataset=val_data, shuffle=False, batch_size=8)

class ReadabilityModel(PreTrainedModel):
    def __init__(self, conf):
        super(ReadabilityModel, self).__init__(conf)
        self.roberta = RobertaModel.from_pretrained(model_name, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l1 = nn.Linear(conf.hidden_size, 1)  # hidden_size is 1024 for roberta-large
        torch.nn.init.normal_(self.l1.weight, std=0.02)

    def forward(self, ids, mask):
        out = self.roberta(input_ids=ids, attention_mask=mask)
        out = out['hidden_states'][-1]    # last hidden layer: (batch, seq, hidden)
        out = self.drop_out(out)
        out = torch.mean(out, dim=1)      # mean-pool over tokens: (batch, hidden)
        preds = self.l1(out).squeeze(-1)  # one raw logit per sample: (batch,)
        # Return logits, not probabilities: BCEWithLogitsLoss applies the sigmoid
        # itself, so the softmax the original applied here double-activated the
        # output (and softmaxed across the batch dimension at that).
        return preds

model_name = 'roberta-large'
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model_config = RobertaConfig.from_pretrained(model_name)
model_config.output_hidden_states = True

model = ReadabilityModel(model_config)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)
loss_fct = nn.BCEWithLogitsLoss()
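
# Optional sanity check (not in the original gist): tokenize one hypothetical
# opcode string and inspect the tensors the training loop will feed the model.
# sample = tokenizer(['push ebp mov ebp esp'], truncation=True, padding=True,
#                    return_tensors='pt', add_special_tokens=True)
# print(sample['input_ids'].shape)       # (1, seq_len)
# print(sample['attention_mask'].shape)  # same shape as input_ids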

epochs = 30


def get_preds(preds_val, thresh=0.5):
    # Binarize sigmoid probabilities at `thresh`.
    return np.where(preds_val >= thresh, 1, 0)


best_accuracy = 0

for epoch in range(epochs):
    model.train()
    loss_train = 0.0
    for i, (opcodes, targets) in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()
        batch = tokenizer(list(opcodes), truncation=True, padding=True,
                          return_tensors='pt', add_special_tokens=True)
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        # `targets` already comes out of the DataLoader as a tensor, so move it
        # to the device directly instead of re-wrapping it with torch.tensor().
        targets = targets.to(device, dtype=torch.float)

        preds = model(input_ids, attention_mask)
        loss = loss_fct(preds, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        loss_train += loss.item()
    loss_train = loss_train / (i + 1)
    model.eval()
    with torch.no_grad():
        loss_val = 0.0
        for i, (opcodes, targets) in enumerate(tqdm(val_loader)):
            batch = tokenizer(list(opcodes), truncation=True, padding=True,
                              return_tensors='pt', add_special_tokens=True)
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            preds = model(input_ids, attention_mask)
            loss = loss_fct(preds, targets)
            loss_val += loss.item()

            # Convert logits to probabilities for the metrics below.
            probs = torch.sigmoid(preds).cpu().numpy()
            targets = targets.cpu().numpy()
            if i == 0:
                preds_val = probs
                targets_val = targets
            else:
                preds_val = np.concatenate((preds_val, probs), axis=None)
                targets_val = np.concatenate((targets_val, targets), axis=None)
        loss_val = loss_val / (i + 1)

    # ROC AUC is computed on the raw probabilities; accuracy on the
    # thresholded labels.
    roc_auc = roc_auc_score(targets_val, preds_val)
    preds_val = get_preds(preds_val)
    accuracy = accuracy_score(targets_val, preds_val)
    print('Epoch: {} - Loss: {:.6f} - Loss val: {:.6f} - Accuracy: {:.3f} ROC_AUC_SCORE: {:.3f}'.format(
        epoch + 1, loss_train, loss_val, accuracy, roc_auc))
    if accuracy > best_accuracy:
        torch.save(model.state_dict(), 'roberta_baseline.bin')
        best_accuracy = accuracy
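
# Minimal inference sketch (an assumption, not part of the original gist):
# reload the best checkpoint and score new opcode strings.
# new_opcodes = ['push ebp mov ebp esp']  # hypothetical list of opcode strings
# model.load_state_dict(torch.load('roberta_baseline.bin', map_location=device))
# model.eval()
# with torch.no_grad():
#     batch = tokenizer(new_opcodes, truncation=True, padding=True,
#                       return_tensors='pt', add_special_tokens=True)
#     probs = torch.sigmoid(model(batch['input_ids'].to(device),
#                                 batch['attention_mask'].to(device)))
#     labels = get_preds(probs.cpu().numpy())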