@jinhangjiang · Last active September 13, 2023
transformers_linear_regression
# Check whether the runtime is attached to a GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)
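The same check can be done through PyTorch directly instead of shelling out to nvidia-smi; a minimal sketch, assuming torch is installed:

import torch

# Equivalent GPU check via PyTorch (a sketch, not part of the original gist)
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print('Not connected to a GPU')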
# Tokenize the held-out test reviews and evaluate the fine-tuned model on them
test_embeddings = tokenizer(test_reviews_3000.reviews.astype(str).tolist(), truncation=True, padding=True, max_length=126)
test_dataset = MakeTorchData(test_embeddings, test_reviews_3000.Rating.astype(float))
bert_trainer.eval_dataset = test_dataset
bert_trainer.evaluate()
{'epoch': 5.0,
'eval_accuracy': 0.6603333333333333,
'eval_loss': 0.48832669854164124,
'eval_mae': 0.45958763360977173,
'eval_mse': 0.4883267283439636,
'eval_r2': 0.6103774787626888,
'eval_rmse': 0.6988037824630737,
'eval_runtime': 13.5094,
'eval_samples_per_second': 222.068,
'eval_steps_per_second': 11.103}
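Beyond these aggregate metrics, individual predictions can be pulled out with Trainer.predict; a minimal sketch, assuming bert_trainer and test_dataset as defined above:

# Inspect raw per-review predictions (a sketch; names follow the cells above)
preds = bert_trainer.predict(test_dataset)
y_pred = preds.predictions.flatten()   # one regression score per review
y_true = preds.label_ids.flatten()
print(y_pred[:5], y_true[:5])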
import torch, gc, random, datasets
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
%load_ext memory_profiler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np
# Make data (Data and Target are placeholders for your text column and numeric labels)
X = Data
y = Target
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=test_size)
# Call the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Encode the text
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)
class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # Regression target: a single float per example;
        # the Trainer's default collator tensorizes it
        item["labels"] = float(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# convert our tokenized data into a torch Dataset
train_dataset = MakeTorchData(train_encodings, y_train.ravel())
valid_dataset = MakeTorchData(valid_encodings, y_test.ravel())
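A quick way to sanity-check the Dataset wrapper before training (illustrative, not in the original gist):

# Inspect one encoded example from the wrapped dataset
sample = train_dataset[0]
print(sample.keys())            # input_ids, attention_mask, (token_type_ids for BERT), labels
print(type(sample["labels"]))   # Python float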
# Load the pretrained model; num_labels=1 turns the sequence-classification
# head into a single-output regression head (MSE loss with float labels)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to("cuda")
# Regression metrics reported by the Trainer at each evaluation
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    # Symmetric mean absolute percentage error
    smape = 1/len(labels) * np.sum(2 * np.abs(logits - labels) / (np.abs(labels) + np.abs(logits)) * 100)
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}
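The metric function can be verified in isolation before handing it to the Trainer; a minimal sketch with made-up numbers:

# Toy check of compute_metrics_for_regression (illustrative only)
dummy_logits = np.array([[1.0], [2.0], [3.0]])
dummy_labels = np.array([1.1, 1.9, 3.2])
print(compute_metrics_for_regression((dummy_logits, dummy_labels)))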
# Specify the arguments for the trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=20,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    save_total_limit=10,
    load_best_model_at_end=True,     # restore the best checkpoint when training ends
    metric_for_best_model='rmse',
    greater_is_better=False,         # needed so "best" means lowest RMSE (default treats non-loss metrics as higher-is-better)
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
# Call the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics_for_regression,
)
# Train the model
trainer.train()
# Call the summary
trainer.evaluate()
Epoch   Training Loss   Validation Loss   MSE        RMSE       MAE        R2         SMAPE
1       No log          0.192932          0.192932   0.439241   0.390277   -2.478255  13.924477
2       No log          0.049018          0.049018   0.221400   0.185570   0.116285   7.139155
3       No log          0.083286          0.083286   0.288593   0.219865   -0.501508  8.309401
...     ...             ...               ...        ...        ...        ...        ...
44      0.012600        0.029716          0.029716   0.172384   0.132232   0.464267   5.152074
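Once training finishes, the restored best checkpoint can be persisted for reuse; a short sketch (the output path here is just an example):

# Save the fine-tuned regressor and its tokenizer side by side
trainer.save_model("./best_rmse_model")
tokenizer.save_pretrained("./best_rmse_model")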
# Variant: add a pseudo-accuracy that counts predictions whose squared error
# is below 0.25, i.e. predictions within 0.5 of the true rating
def compute_metrics_for_regression(eval_pred):
    ...
    ...
    single_squared_errors = ((logits - labels).flatten()**2).tolist()
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "accuracy": accuracy}
Epoch   Training Loss   Validation Loss   MSE        RMSE       MAE        R2         Accuracy
1       No log          0.507760          0.507760   0.712573   0.491016   0.581568   0.653206
2       No log          0.434158          0.434158   0.658906   0.438609   0.642221   0.662969
3       No log          0.469371          0.469371   0.685106   0.449530   0.613203   0.669477
4       No log          0.440199          0.440199   0.663475   0.432040   0.637242   0.668738
5       0.769400        0.447230          0.447231   0.668753   0.439436   0.631448   0.669107
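For completeness, a sketch of scoring a single new review with the fine-tuned regressor; the example text is made up, and model and tokenizer come from the cells above:

# Predict a rating for one unseen review (illustrative)
inputs = tokenizer("Great product, works exactly as advertised.",
                   return_tensors="pt", truncation=True, padding=True,
                   max_length=126).to(model.device)
with torch.no_grad():
    rating_pred = model(**inputs).logits.item()   # single regression score
print(rating_pred)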