Fine-tuning for HuggingFace Sentiment Classifier https://lppier.github.io/NLP-PK!-HuggingFace-vs-AWS-ML-Services
from pathlib import Path

import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# The IMDB dataset can be downloaded with:
# wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# tar -xf aclImdb_v1.tar.gz
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text(encoding="utf8"))
            # Compare strings with ==, not "is" (identity), which is not guaranteed to match
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels
train_texts, train_labels = read_imdb_split("data/aclImdb/train")
test_texts, test_labels = read_imdb_split("data/aclImdb/test")

# Further split the training set to get a validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1
)

# Tokenize all three splits with the DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
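# Wrap the tokenized encodings in a torch Dataset so the HuggingFace Trainer can
# batch and shuffle them; __getitem__ returns the input tensors plus a "labels" entry.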
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,
)
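# from_pretrained() below loads the pretrained DistilBERT encoder and attaches a freshly
# initialised 2-label classification head, which is what gets trained during fine-tuning.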
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                      # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,  # metrics reported during evaluation
)
trainer.train()  # checkpoints are saved under the output_dir ("./results") folder
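# Not in the original gist: test_dataset is built above but never scored directly, so as an
# optional extra step the held-out IMDb test split can be evaluated with Trainer.evaluate(),
# which reuses the compute_metrics function defined earlier.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(test_metrics)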
# Inference Code
import pandas as pd

model = DistilBertForSequenceClassification.from_pretrained("./results_old/checkpoint-3500")
model.eval()  # disable dropout for inference

df_labels = pd.read_csv("data/comprehendimdbtest.csv", header=None)
test_data = df_labels.iloc[:, 1].to_list()
predictions = []
# Predict one review at a time; encoding and scoring the whole list in one go exhausts CPU memory
for text in test_data:
    encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # no gradients needed at inference time
        outputs = model(**encoding)
    pt_predictions = F.softmax(outputs[0], dim=-1)
    # Append the scalar class id rather than the tensor itself to keep memory usage low
    predictions.append(pt_predictions.argmax(-1).item())
    print(pt_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    df_labels.iloc[:, 0], predictions, average="binary"
)
acc = accuracy_score(df_labels.iloc[:, 0], predictions)
metrics = {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
print(metrics)