@siddhsql
Created May 31, 2023 18:29
text classification using transformers and pytorch
import torch
from torch import nn
from transformers import AutoTokenizer, AutoConfig, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,    # 30522
                                             config.hidden_size)   # 768
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,  # 512
                                                config.hidden_size)              # 768
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # 1e-12
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 0.1 (in the original code sample no argument is given, which defaults the probability to 0.5)
    def forward(self, input_ids):
        seq_length = input_ids.size(1)  # number of tokens (columns) in the padded batch
        # position ids [0, 1, 2, ..., seq_length - 1], created on the same device as input_ids
        # so the module still works when the Trainer moves the model to a GPU
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)            # e.g. 64x85x768: each token id is mapped to a 768-d real vector
        position_embeddings = self.position_embeddings(position_ids)   # 1x85x768; broadcast across the batch in the addition below
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)  # 64x85x768
        embeddings = self.dropout(embeddings)     # randomly zeroes some activations (regularization)
        return embeddings
class TransformerForSequenceClassification(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.embeddings = Embeddings(config)
        # batch_first=True so the encoder accepts (batch, seq, hidden) tensors, matching the shape produced by Embeddings
        encoder_layer = nn.TransformerEncoderLayer(d_model=config.hidden_size,
                                                   nhead=config.num_attention_heads,  # 12
                                                   batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)  # 12
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 0.1
        self.classifier = nn.Linear(config.hidden_size, num_labels)  # linear map from a 768-d feature vector to num_labels logits
    def forward(self, input_ids, labels=None, **kwargs):
        x = self.embeddings(input_ids)  # each input_id lies in [0, vocab_size)
        x = self.encoder(x)[:, 0, :]    # hidden state of the first token, used as the sentence-level feature
                                        # (with add_special_tokens=True this would be the [CLS] token)
        x = self.dropout(x)             # a fraction hidden_dropout_prob of entries is zeroed out during training to reduce overfitting
        logits = self.classifier(x)     # e.g. 64x6 matrix: one unnormalized score (logit) per label; softmax turns them into probabilities
        if labels is not None:
            # return a scalar loss so the Trainer can call loss.backward()
            loss = nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
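# Shape walk-through (illustrative, assuming a batch of 64 sequences padded to length 85, as in the inline comments above):
#   input_ids               64 x 85         integer token ids
#   embeddings(input_ids)   64 x 85 x 768   token + position embeddings
#   encoder(...)            64 x 85 x 768   contextualized hidden states
#   [:, 0, :]               64 x 768        first-token pooling
#   classifier(...)         64 x 6          one logit per emotion label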
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
def create_training_args(emotions_encoded):
    batch_size = 64
    logging_steps = len(emotions_encoded["train"]) // batch_size
    return TrainingArguments(output_dir='out',
                             num_train_epochs=2,
                             learning_rate=1e-5,
                             per_device_train_batch_size=batch_size,
                             per_device_eval_batch_size=batch_size,
                             weight_decay=0.01,
                             evaluation_strategy="epoch",
                             disable_tqdm=False,
                             logging_steps=logging_steps,
                             push_to_hub=False,
                             log_level="error")
if __name__ == "__main__":

    def tokenize(batch):
        # add_special_tokens=False means no [CLS]/[SEP] tokens are inserted, so the model
        # pools the hidden state of the first word-piece rather than a [CLS] token
        return tokenizer(batch["text"], padding=True, truncation=True,
                         return_tensors="pt", add_special_tokens=False)
    emotions = load_dataset('emotion')  # https://huggingface.co/docs/datasets/v1.1.3/loading_datasets.html
    model_ckpt = "bert-base-uncased"
    config = AutoConfig.from_pretrained(model_ckpt)
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    transformer = TransformerForSequenceClassification(config, 6)  # there are 6 labels in the emotion dataset
    emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
    training_args = create_training_args(emotions_encoded)
    trainer = Trainer(model=transformer,
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=emotions_encoded["train"],
                      eval_dataset=emotions_encoded["validation"])  # we are not passing any tokenizer as we have already tokenized the input
    trainer.train()
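
    # Illustrative usage sketch (not part of the original gist): classify a new sentence with the
    # freshly trained model. Assumes the emotion dataset's ClassLabel names in their default order.
    label_names = emotions["train"].features["label"].names  # e.g. ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    sample = tokenizer(["i am feeling great today"], padding=True, truncation=True,
                       return_tensors="pt", add_special_tokens=False)
    device = next(transformer.parameters()).device  # the Trainer may have moved the model to a GPU
    transformer.eval()
    with torch.no_grad():
        logits = transformer(sample["input_ids"].to(device))["logits"]
    print(label_names[logits.argmax(-1).item()])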