# Created May 31, 2023 18:29
# Text classification using transformers and PyTorch
import torch
from torch import nn
from transformers import AutoTokenizer, AutoConfig, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
class Embeddings(nn.Module):
    """Token embeddings plus learned position embeddings, then LayerNorm and dropout.

    Args:
        config: object exposing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings``, ``layer_norm_eps`` and
            ``hidden_dropout_prob`` (the inline values quoted below are the
            ones the original author noted for the checkpoint they used).
    """

    def __init__(self, config):
        # nn.Module must be initialised before any sub-module can be assigned;
        # without this call the first assignment below raises AttributeError.
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,  # 30522
                                             config.hidden_size)  # 768
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,  # 512
                                                config.hidden_size)  # 768
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # 1e-12
        # 0.1. Passing the probability explicitly matters: nn.Dropout() with no
        # argument would default to 0.5.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        """Embed a batch of token ids.

        Args:
            input_ids: integer tensor indexed as (batch, sequence); dimension 1
                is read as the sequence length.

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing
            ``hidden_size`` dimension.
        """
        seq_length = input_ids.size(1)  # number of columns = tokens per sequence
        # Positions [0, 1, ..., seq_length - 1], created on the same device as the
        # input so the embedding lookup also works when the model runs on a GPU.
        position_ids = torch.arange(seq_length, dtype=torch.long,
                                    device=input_ids.device).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)  # each token id -> hidden_size vector
        position_embeddings = self.position_embeddings(position_ids)  # (1, seq, hidden)
        # The leading dimension of size 1 broadcasts over the batch.
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)  # randomly zeroes entries while training
        return embeddings
class TransformerForSequenceClassification(nn.Module):
    """Encoder-only transformer with a linear classification head.

    Args:
        config: object exposing ``hidden_size``, ``num_attention_heads``,
            ``num_hidden_layers`` and ``hidden_dropout_prob``, plus whatever
            ``Embeddings`` reads from it.
        num_labels: number of output classes.
    """

    def __init__(self, config, num_labels):
        # Required before sub-modules can be registered on an nn.Module.
        super().__init__()
        self.embeddings = Embeddings(config)
        # batch_first=True: the embeddings are laid out (batch, seq, hidden).
        # The PyTorch default (batch_first=False) would treat dimension 0 as the
        # sequence axis and run attention across the batch instead of the tokens.
        encoder_layer = nn.TransformerEncoderLayer(d_model=config.hidden_size,
                                                   nhead=config.num_attention_heads,  # 12
                                                   batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)  # 12
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 0.1
        # Linear map from a hidden_size vector to one score per label.
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(self, input_ids, **kwargs):
        """Return classification logits for a batch of token ids.

        Args:
            input_ids: integer tensor indexed as (batch, sequence).
            **kwargs: accepted and ignored. Anything passed here (for example
                labels or an attention mask) has no effect: no loss is
                computed and no padding mask is applied.

        Returns:
            Tensor of shape (batch, num_labels) holding unnormalised scores.
        """
        x = self.embeddings(input_ids)
        # Keep only the hidden state at position 0 of every sequence and use it
        # as the feature vector for classification.
        x = self.encoder(x)[:, 0, :]
        x = self.dropout(x)  # regularisation; active only in training mode
        x = self.classifier(x)
        return x
def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted F1.

    Args:
        pred: object exposing ``predictions`` (per-class scores, reduced with
            ``argmax`` over the last axis) and ``label_ids`` (true labels).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predicted = pred.predictions.argmax(-1)
    expected = pred.label_ids
    return {
        "accuracy": accuracy_score(expected, predicted),
        "f1": f1_score(expected, predicted, average="weighted"),
    }
def create_training_args(emotions_encoded):
    """Build the ``TrainingArguments`` used for the run.

    Args:
        emotions_encoded: mapping with a ``"train"`` entry that supports
            ``len()``; its size sets the logging interval.

    Returns:
        A ``TrainingArguments`` instance writing to the ``out`` directory.
    """
    batch_size = 64
    # Roughly one log entry per pass over the training split. Clamp to 1 so a
    # split smaller than one batch does not produce an interval of 0.
    logging_steps = max(1, len(emotions_encoded["train"]) // batch_size)
    # NOTE(review): the original argument list was cut off after ``output_dir``.
    # Only the two values computed above are passed here; every other setting
    # falls back to the library default. Restore any lost hyperparameters.
    return TrainingArguments(output_dir='out',
                             per_device_train_batch_size=batch_size,
                             per_device_eval_batch_size=batch_size,
                             logging_steps=logging_steps)
if __name__ == "__main__":
    def tokenize(batch):
        """Tokenize the ``"text"`` field of a batch, padding and truncating."""
        # NOTE(review): add_special_tokens=False means no special tokens are
        # added, while the classifier pools position 0 of each sequence.
        # Confirm that pooling the first word piece is intended.
        return tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt", add_special_tokens=False)

    emotions = load_dataset('emotion')
    model_ckpt = "bert-base-uncased"
    # Only the configuration and tokenizer come from the checkpoint; the model
    # itself is built from scratch below.
    config = AutoConfig.from_pretrained(model_ckpt)
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    transformer = TransformerForSequenceClassification(config, 6)  # there are 6 labels in the emotion dataset

    # NOTE(review): the right-hand side of this assignment was garbled in the
    # source (``=, batched=True, batch_size=None)``). It is reconstructed as a
    # ``map`` of ``tokenize`` over the dataset; batch_size=None processes each
    # split as a single batch.
    emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
    training_args = create_training_args(emotions_encoded)

    # NOTE(review): the arguments between ``model`` and ``eval_dataset`` were
    # missing from the source. They are filled in from names this file defines
    # but otherwise never uses. No ``trainer.train()`` call was visible either;
    # add one if training is meant to start here.
    trainer = Trainer(model=transformer,
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=emotions_encoded["train"],
                      eval_dataset=emotions_encoded["validation"])  # we are not passing any tokenizer as we have already tokenized the input
