Text classification using transformers and PyTorch: a custom embedding layer plus an nn.TransformerEncoder, fine-tuned on the emotion dataset with the Hugging Face Trainer API.
import torch
from torch import nn
from transformers import AutoTokenizer, AutoConfig, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,   # 30522
                                             config.hidden_size)  # 768
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,  # 512
                                                config.hidden_size)              # 768
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # 1e-12
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 0.1. in the original code sample no argument is given, which defaults the probability to 0.5

    def forward(self, input_ids):
        seq_length = input_ids.size(1)  # number of columns in the batch matrix, i.e. tokens per sequence
        # vector [0, 1, 2, ..., seq_length - 1]; created on input_ids' device so the model also runs on GPU
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)  # (batch, seq_length, 768) tensor, e.g. 64x85x768: each token id maps to a 768-d vector
        position_embeddings = self.position_embeddings(position_ids)  # (1, seq_length, 768) tensor, broadcast over the batch
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)  # (batch, seq_length, 768) tensor
        embeddings = self.dropout(embeddings)  # randomly zeroes some entries during training
        return embeddings
class TransformerForSequenceClassification(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.embeddings = Embeddings(config)
        # batch_first=True so the encoder accepts the (batch, seq_length, hidden) tensors produced by Embeddings
        encoder_layer = nn.TransformerEncoderLayer(d_model=config.hidden_size,
                                                   nhead=config.num_attention_heads,  # 12
                                                   batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)  # 12
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 0.1
        self.classifier = nn.Linear(config.hidden_size, num_labels)  # linear transformation from a 768-d vector to a num_labels-d (here 6-d) vector

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        x = self.embeddings(input_ids)  # applies Embeddings.forward; each input_id lies in [0, vocab_size)
        # mask padding positions so they are not attended to; attention_mask is 1 for real tokens, 0 for padding
        padding_mask = (attention_mask == 0) if attention_mask is not None else None
        x = self.encoder(x, src_key_padding_mask=padding_mask)[:, 0, :]  # hidden state of the [CLS] token, commonly used as the feature for classification
        x = self.dropout(x)  # ~10% of entries are zeroed out; a regularization technique to prevent overfitting
        logits = self.classifier(x)  # (batch, num_labels) matrix, e.g. 64x6: unnormalized scores that softmax turns into label probabilities
        if labels is not None:
            # return the loss alongside the logits so the Hugging Face Trainer can train this plain nn.Module
            loss = nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
def create_training_args(emotions_encoded):
    batch_size = 64
    logging_steps = len(emotions_encoded["train"]) // batch_size  # log once per epoch
    return TrainingArguments(output_dir="out",
                             num_train_epochs=2,
                             learning_rate=1e-5,
                             per_device_train_batch_size=batch_size,
                             per_device_eval_batch_size=batch_size,
                             weight_decay=0.01,
                             evaluation_strategy="epoch",
                             disable_tqdm=False,
                             logging_steps=logging_steps,
                             push_to_hub=False,
                             log_level="error")
if __name__ == "__main__":
    def tokenize(batch):
        # special tokens are kept (the default) so that [CLS] is prepended: the model classifies on its hidden state
        return tokenizer(batch["text"], padding=True, truncation=True)

    emotions = load_dataset("emotion")  # https://huggingface.co/docs/datasets/v1.1.3/loading_datasets.html
    model_ckpt = "bert-base-uncased"
    config = AutoConfig.from_pretrained(model_ckpt)
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    transformer = TransformerForSequenceClassification(config, 6)  # there are 6 labels in the emotion dataset
    emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)  # batch_size=None tokenizes each split in one batch, so padding is uniform
    training_args = create_training_args(emotions_encoded)
    trainer = Trainer(model=transformer,
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=emotions_encoded["train"],
                      eval_dataset=emotions_encoded["validation"])  # no tokenizer is passed since the input is already tokenized
    trainer.train()
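
Once training finishes, the model can be queried directly. A minimal inference sketch, not part of the original gist; the label names below assume the standard ordering of the emotion dataset:

    text = "i am so happy today"
    device = next(transformer.parameters()).device  # the Trainer may have moved the model to GPU
    inputs = tokenizer(text, return_tensors="pt")
    transformer.eval()
    with torch.no_grad():
        out = transformer(input_ids=inputs["input_ids"].to(device),
                          attention_mask=inputs["attention_mask"].to(device))
    label_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]  # assumed label order
    print(label_names[out["logits"].argmax(-1).item()])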