Skip to content

Instantly share code, notes, and snippets.

@Dref360
Last active June 29, 2023 14:11
Show Gist options
  • Save Dref360/6b533ab664144cd4746f9f7e61b19fd4 to your computer and use it in GitHub Desktop.
Save Dref360/6b533ab664144cd4746f9f7e61b19fd4 to your computer and use it in GitHub Desktop.
Example of how to use Baal for NER use cases with HuggingFace.
from copy import deepcopy

from datasets import load_dataset
from transformers import pipeline, DataCollatorForTokenClassification
from baal.active.active_loop import ActiveLearningLoop
from baal.active.dataset import ActiveLearningDataset
from baal.active.heuristics import BALD
from baal.bayesian.dropout import patch_module
from baal.transformers_trainer_wrapper import BaalTransformersTrainer
# Load the dataset registered under the name "conll2003" via `datasets`.
dataset = load_dataset("conll2003")
# Build a 'ner' pipeline from the named checkpoint.
# NOTE: this rebinds `pipeline`, shadowing the factory function imported from
# `transformers`; the rest of the script reads `pipeline.tokenizer` and
# `pipeline.model` from this instance, so the factory is no longer callable.
pipeline = pipeline('ner', model='issifuamajeed/distilbert-base-uncased-finetuned-ner')
# Reuse the tokenizer that ships with the pipeline.
tokenizer = pipeline.tokenizer
# Cap the tokenizer's maximum sequence length at 150.
# NOTE(review): presumably this is the length that `truncation=True` and
# `padding='max_length'` resolve to during tokenization - confirm.
tokenizer.model_max_length = 150
def align_labels_with_tokens(labels, word_ids, ignore_index=-100):
    """Expand word-level labels into one label per token.

    Args:
        labels: Sequence of integer label ids, one per word.
        word_ids: One entry per token: the index of the word the token was
            produced from, or ``None`` for a token that belongs to no word
            (special tokens).
        ignore_index: Label emitted for tokens whose word id is ``None``.
            Defaults to -100, the default ``ignore_index`` of PyTorch's
            cross-entropy loss.

    Returns:
        A new list with ``len(word_ids)`` labels.

    Raises:
        IndexError: If a word id has no corresponding entry in ``labels``.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: never contributes a real label.
            new_labels.append(ignore_index)
        elif word_id != current_word:
            # Start of a new word: it takes the word's own label.
            new_labels.append(labels[word_id])
        else:
            # Continuation of the previous word.
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX. This assumes a tag
            # scheme in which odd ids are B- tags and the following even id
            # is the matching I- tag.
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
        current_word = word_id
    return new_labels
# Tokenize dataset
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping that provides ``"tokens"`` (the words of each
            sentence) and ``"ner_tags"`` (the word-level label ids).

    Returns:
        The tokenizer output for the batch, with an added ``"labels"`` entry
        holding one aligned label list per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
    )
    # word_ids(row) gives the token -> word mapping of the row-th sentence.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded
# Run the tokenizer over the dataset in batches; the original columns of the
# train split are dropped so only the tokenizer output and labels remain.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
# BAAL: Setup ALDataset and label 100 examples.
al_dataset = ActiveLearningDataset(dataset=tokenized_dataset['train'])
al_dataset.label_randomly(100)

# Apply MC-Dropout, create trainer and loop objects
model = patch_module(pipeline.model)
# Snapshot the starting weights with a deep copy: `state_dict()` hands back
# references to the live parameter tensors, so without the copy the snapshot
# would follow the model through training and reloading it would be a no-op.
init_weights = deepcopy(model.state_dict())
trainer = BaalTransformersTrainer(
    model=model,
    train_dataset=al_dataset,
    eval_dataset=tokenized_dataset['validation'],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    tokenizer=tokenizer,
)
# NOTE(review): no `iterations` kwarg is forwarded to the loop, so
# `predict_on_dataset` runs with its default number of stochastic passes when
# `loop.step()` queries the pool - confirm that default is what BALD needs.
loop = ActiveLearningLoop(
    dataset=al_dataset,
    get_probabilities=trainer.predict_on_dataset,
    heuristic=BALD(reduction='sum'),
    query_size=100,
)
"""Prediction piece"""
# Predict on the test split with 10 iterations.
# Output layout: [Batch_size, Num-Tokens, Probabilities, Iterations]
predictions = trainer.predict_on_dataset(tokenized_dataset['test'], iterations=10)
# Move the class axis in front of the token axis before scoring:
# [batch_size, Probabilities, Num Tokens, Iteration]
class_first = predictions.swapaxes(1, 2)
# Calling the heuristic yields the order in which to label; the second call
# exposes the raw uncertainty scores.
next_to_label = BALD(reduction='sum')(class_first)
uncertainties = BALD().get_uncertainties(class_first)
"""Training Piece"""
# Two rounds of: reset -> train -> evaluate -> query new labels.
for _ in range(2):
    # Start every round from the saved initial weights.
    trainer.load_state_dict(init_weights)
    print(f"Active learning: labelled={al_dataset.n_labelled} unlabelled={al_dataset.n_unlabelled}")
    trainer.train()
    # NOTE(review): presumably cleared so the next train() call builds a
    # fresh learning-rate scheduler - confirm against the trainer.
    trainer.lr_scheduler = None
    trainer.evaluate()
    # step() labels the next batch picked by the heuristic and returns a flag
    # telling whether to continue; stop once it reports nothing left to do.
    if not loop.step():
        break
@ayushkm2799
Copy link

ayushkm2799 commented Jun 15, 2023

hi @Dref360,

I already tried this, but I am getting the error below; the same error occurs with your code as well.

Exception has occurred: AttributeError (note: full exception trace is shown but execution is paused at: )
'size'
KeyError: 'size'

During handling of the above exception, another exception occurred:

File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/utils/array_utils.py", line 40, in stack_in_memory
input_shape = data.size()
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 74, in
lambda element: map_on_tensor(lambda d: stack_in_memory(d, iterations), element),
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/utils/iterutils.py", line 10, in map_on_tensor
return fn(val)
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 74, in
lambda element: map_on_tensor(lambda d: stack_in_memory(d, iterations), element),
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/utils/iterutils.py", line 10, in map_on_tensor
return fn(val)
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 73, in predict_on_dataset_generator
inputs = map_on_tensor(
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 111, in predict_on_dataset
preds = list(
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal1.py", line 82, in (Current frame)
predictions = trainer.predict_on_dataset(tokenized_dataset['test'], iterations=10)
AttributeError:

@Dref360
Copy link
Author

Dref360 commented Jun 15, 2023

Are you sure you are using the branch feat/handle_ner @ayushkm2799 ?

I updated map_on_tensor to handle this issue specifically.
https://github.com/baal-org/baal/pull/263/files#diff-a5206cabfdb30f2ab85f9320276e3cbcbb0c86cfe5f4fcd03a94ca63721b6d91L4

You can also book a meeting on Calendly so that we can debug the issue together.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment