# Fine-tuning for HuggingFace Sentiment Classifier
# https://lppier.github.io/NLP-PK!-HuggingFace-vs-AWS-ML-Services
# Gist by @lppier, created December 27, 2020
from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# The IMDB dataset can be downloaded and extracted like so:
# wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# tar -xf aclImdb_v1.tar.gz

def compute_metrics(pred):
    # Called by the Trainer on each evaluation pass; pred.label_ids holds the
    # true labels, pred.predictions the raw logits.
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


def read_imdb_split(split_dir):
    # Read the pos/neg review files from an aclImdb split directory and return
    # parallel lists of texts and binary labels (neg=0, pos=1).
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text(encoding="utf8"))
            labels.append(0 if label_dir == "neg" else 1)  # "is" here would compare identity, not value
    return texts, labels

train_texts, train_labels = read_imdb_split("data/aclImdb/train")
test_texts, test_labels = read_imdb_split("data/aclImdb/test")

# Further split the training set to get a validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1
)
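
# A possible refinement (not in the original gist): passing stratify=train_labels
# to train_test_split would keep the pos/neg ratio identical across the train
# and validation splits:
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     train_texts, train_labels, test_size=0.1, stratify=train_labels
# )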

# Tokenize all splits with the pretrained DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
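
# The tokenizer output is dict-like, with "input_ids" and "attention_mask"
# lists holding one entry per review; a quick sanity check (illustrative, not
# in the original gist):
# print(list(train_encodings.keys()))       # ['input_ids', 'attention_mask']
# print(len(train_encodings["input_ids"]))  # == len(train_texts)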

class IMDbDataset(torch.utils.data.Dataset):
    # Wraps the tokenizer output and labels as a torch Dataset the Trainer can batch.
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
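
# Illustrative check (not in the original gist): each item is a dict of
# tensors, which is exactly what the Trainer's default collator expects:
# sample = train_dataset[0]
# print(sample.keys())  # dict_keys(['input_ids', 'attention_mask', 'labels'])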

training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,                # log every 10 optimization steps
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,     # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()  # checkpoints are saved under output_dir ("./results")
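
# A possible extra step (not in the original gist): the held-out IMDB test
# split built above can be scored directly through the Trainer API:
# test_metrics = trainer.evaluate(eval_dataset=test_dataset)
# print(test_metrics)  # keys are prefixed "eval_", e.g. eval_accuracy, eval_f1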

# Inference code: reload a fine-tuned checkpoint (here one from an earlier run,
# saved under ./results_old) and score an external test set
model = DistilBertForSequenceClassification.from_pretrained("./results_old/checkpoint-3500")
df_labels = pd.read_csv("data/comprehendimdbtest.csv", header=None)  # column 0: label, column 1: review text
test_data = df_labels.iloc[:, 1].to_list()
predictions = []
# Predict one review at a time -- batching the whole set at once loads up CPU memory
model.eval()
with torch.no_grad():  # no gradient bookkeeping needed at inference time
    for text in test_data:
        encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**encoding)
        pt_predictions = F.softmax(outputs[0], dim=-1)
        # Append the plain int, not the tensor -- keeping tensors is too memory intensive!
        predictions.append(pt_predictions.argmax(-1).item())
        print(pt_predictions)

precision, recall, f1, _ = precision_recall_fscore_support(
    df_labels.iloc[:, 0], predictions, average="binary"
)
acc = accuracy_score(df_labels.iloc[:, 0], predictions)
metrics = {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
print(metrics)
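
# A possible final step (not in the original gist): export the fine-tuned model
# and tokenizer so they can be reloaded later with from_pretrained().
# "./sentiment-distilbert" is a hypothetical output path:
# model.save_pretrained("./sentiment-distilbert")
# tokenizer.save_pretrained("./sentiment-distilbert")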