Use Baal in production (Text classification)¶
In this tutorial, we will show you how to use Baal during your labeling task.
NOTE In this tutorial, we assume that we do not know the labels!
import os
import random
from copy import deepcopy
import numpy as np
import torch.backends
import transformers
# These packages are optional and not needed for BaaL main package.
# You can have access to `datasets` and `transformers` if you install
# BaaL with --dev setup.
from datasets import load_dataset
from tqdm import tqdm
from transformers import BertForSequenceClassification
from transformers import BertTokenizer, TrainingArguments
from transformers import set_seed
# Only warnings for HF
transformers.utils.logging.set_verbosity_warning()
from baal.active import get_heuristic
from baal.active.dataset.nlp_datasets import (
active_huggingface_dataset,
HuggingFaceDatasets,
)
from baal.bayesian.dropout import patch_module
from baal.transformers_trainer_wrapper import BaalTransformersTrainer
from typing import List
# Seed Python and PyTorch RNGs so the run is reproducible
random.seed(1337)
torch.manual_seed(1337)
# Set transformers seed to ensure that initial weights are identical
set_seed(101)
Information on the hyperparams below:
epoch: Number of times you want to run the AL loop
batch_size: The train and eval batch size for the HF trainer arguments
model: Hugging Face model
query_size: Number of samples you want to query at each AL iteration for labelling
heuristic: The acquisition function/heuristic based on which you want to query the important samples
iterations: The number of iterations you want to run MC-Dropout for to find the uncertainties
shuffle_prop: Additional noise to counter selection bias
learning_epoch: Training epochs for the Hugging Face trainer
# Active-learning hyperparameters; see the glossary above for details.
hyperparams = {
    "epoch": 10,  # number of AL loop iterations
    "batch_size": 4,  # train/eval batch size for the HF trainer
    "model": "bert-base-uncased",  # Hugging Face model name
    "query_size": 5,  # samples queried for labelling per AL iteration
    "heuristic": "bald",  # acquisition function used to rank pool samples
    "iterations": 15,  # MC-Dropout iterations used to estimate uncertainty
    "shuffle_prop": 0.05,  # extra noise to counter selection bias
    "learning_epoch": 3,  # fine-tuning epochs for the HF trainer
}
# Check for CUDA
use_cuda = torch.cuda.is_available()
torch.backends.cudnn.benchmark = True  # let cuDNN auto-tune kernel choices
# Load Model
# NOTE(review): force_download=True re-fetches the weights on every run;
# dropping it would reuse the local HF cache after the first download.
hf_model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=hyperparams["model"],
    force_download=True,
    num_labels=4,  # four emotion classes: anger, joy, optimism, sadness
)
# Setup tokenizer for model
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=hyperparams["model"]
)
# Enable dropouts for predictions (needed for MC-Dropout uncertainty)
hf_model = patch_module(hf_model)
# Send model to device and setup cuda arguments for the HF trainer
if use_cuda:
    hf_model.to("cuda:0")
    no_cuda = False
else:
    hf_model.to("cpu")
    no_cuda = True
We are using the dataset from Tweet Eval on Hugging Face. To mimic a production setting where we might need to load data from an external source, we have made a copy of this data to .csv
files. Of course you can modify this based on the source of your data.
There could be different scenarios for your train, test and validation datasets, especially if you are collecting data and evaluating the model simultaneously. Offered in this tutorial are 2 functions which can help you deal with most of these situations.
We will go over a scenario where we assume that you have some labelled train data but you still want a human oracle to label a few more samples. For simplicity we will assume that the validation and test sets are completely labelled. In a real-world scenario, if this is not the case for the validation set, you can re-purpose the train labeller
to label the validation set, or do it manually if you want.
def clip_data(split, percent=10, dataset=None):
    """
    Return a reduced copy of one dataset split for quick CPU demos.

    Args:
    ----
    split : Name of the split to clip (e.g. "train", "validation")
    percent : Percentage of the split to keep, taken from the start
    dataset : Mapping of split name -> HF dataset; defaults to the
        module-level ``dataset_emo`` loaded below

    Returns:
    ----
    The first ``percent`` % of rows of ``dataset[split]``
    """
    if dataset is None:
        dataset = dataset_emo
    keep = round((percent / 100) * len(dataset[split]))
    # Selecting the leading indices directly is equivalent to — and much
    # simpler than — building the exclusion set and taking its complement.
    return dataset[split].select(range(keep))
# Define labels in your dataset
# (tweet_eval "emotion": 0=anger, 1=joy, 2=optimism, 3=sadness — see the
# oracle prompt below)
label_list = [0, 1, 2, 3]
# Load data from files
dataset_emo = load_dataset("tweet_eval", "emotion")
# Reduce dataset size to 10% for CPU-only machines
if not use_cuda:
    print(
        "Complete dataset processing will take ages to run, clipping data just for demo on CPU"
    )
    raw_train_set = clip_data("train", 10)
    raw_valid_set = clip_data("validation", 10)
else:
    raw_train_set = dataset_emo["train"]
    raw_valid_set = dataset_emo["validation"]
def get_label_from_data(active_dataset, indexes) -> List[int]:
    """
    Fetch labels for pool samples from the already-labelled source data.

    Args:
    ----
    active_dataset : Active dataset which consists of train and pool
    indexes : Pool indexes of the points whose labels should be fetched

    Returns:
    ----
    labels: The label stored in ``raw_train_set`` for each requested point
    """
    # Pool indexes shift as points get labelled, so translate them back to
    # indexes into the original (oracle) dataset before looking labels up.
    oracle_indexes = active_dataset._pool_to_oracle_index(indexes)
    labels = []
    for oracle_idx in oracle_indexes:
        print(f"Adding labels for Raw data Index {oracle_idx} : {raw_train_set['text'][oracle_idx]}")
        print("\n")
        labels.append(raw_train_set["label"][oracle_idx])
    print("\n")
    return labels
def get_label_human_oracle(active_dataset, indexes) -> "tuple[List[int], np.ndarray]":
    """
    Ask a human oracle to label pool samples interactively.

    For each sample the oracle types one of the labels in ``label_list``,
    or -1 to skip a sample they are unsure about. Invalid input re-prompts.

    Args:
    ----
    active_dataset : Active dataset which consists of train and pool
    indexes : Pool indexes of the points to present to the oracle

    Returns:
    ----
    labels: The labels entered by the oracle (skipped samples excluded)
    indexes_upd: ``indexes`` with the skipped positions removed, aligned
        element-wise with ``labels``
    """
    labels = []
    skipped_positions = []  # positions within `indexes`, not pool indexes
    print(" 0: anger , 1: joy, 2: optimism, 3: sadness")
    for position, pool_idx in enumerate(indexes):
        while True:
            # Re-build the prompt on every retry so the sample stays visible.
            raw = input(
                f"Pool Index {pool_idx} : {active_dataset.pool.__getitem__(pool_idx)['inputs']}"
            )
            try:
                label = int(raw)
            except ValueError:
                print("Sorry, I didn't understand that.")
                continue
            if label == -1:
                print("Skipping this sample")
                skipped_positions.append(position)
                break
            if label in label_list:
                labels.append(label)
                break
            print(f"Allowed labels are {label_list}")
        print("\n")
    indexes_upd = np.delete(indexes, skipped_positions)
    return labels, indexes_upd
Some intuition on how active_huggingface_dataset
looks at your data: once you convert your data to an active_huggingface_dataset,
irrespective of the labels provided the complete dataset is considered as pool. If you have some already-labelled points, or you want to label some points, then you need to explicitly tell this to your active_huggingface_dataset
. The functions get_label_from_data
and get_label_human_oracle
above are provided for that specific purpose.
In our scenario we assume that we have 28 points for which we already have the label, and 2 points which we want the human oracle to label.
Then get_label_from_data
, called with the active dataset and the indexes of the points for which you already have labels, will return the indexes of the samples and the corresponding labels.
Calling get_label_human_oracle
with the active dataset and the indexes of the points will prompt an input from the human oracle, and again returns indexes of samples and corresponding labels. Note that you can pass -1
irrespective of your actual labels to skip a certain sample which you are still unsure about.
Once we have labels from these functions we can call the label
method on our active_huggingface_dataset
with all the indexes and the corresponding labels.
NOTE: Make sure before calling the label
method on our active_huggingface_dataset
you have set active_set.can_label = True
. This will ensure that the dataset can be labelled.
Once the samples are labelled in the active_set you can see that the pool length will decrease by the number of labelled samples; now you have a train and a pool set.
# Suppose now you have 30 indexes from the train set which are either to be
# labelled or already have labels. replace=False guarantees unique indexes.
point_idx_train = np.random.choice(len(raw_train_set) - 1, 30, replace=False)
# These are points for which labels are available in the source data
points_to_label_dataset = point_idx_train[:28]
# These are points which will need to be manually labelled by a human oracle
points_to_label_oracle = point_idx_train[-2:]
# Convert your dataset into an active learning dataset
active_set = active_huggingface_dataset(raw_train_set, tokenizer, input_key="text")
# Allow your active set to be labelled, without this you can't label the active set
active_set.can_label = True
# Now once your dataset is converted into an active dataset, the active dataset
# assumes all your points are part of the pool set and are unlabelled. Even
# if you have a label in the dataset for them
assert len(active_set.pool) == len(raw_train_set)
# Label points using data that you have
label_from_data = get_label_from_data(active_set, points_to_label_dataset)
# Label points directly using human oracle (returned indexes exclude any
# samples the oracle skipped with -1)
label_from_oracle, points_to_label_oracle = get_label_human_oracle(
    active_set, points_to_label_oracle
)
# Label active dataset with all gathered indexes and their labels
active_set.label(
    np.append(points_to_label_dataset, points_to_label_oracle),
    label_from_data + label_from_oracle,
)
print(f"Length of active pool is now", len(active_set.pool))
# Every labelled point moves from the pool into the train split
assert len(active_set.pool) == len(raw_train_set) - len(points_to_label_oracle) - len(
    points_to_label_dataset
)
# Setup validation set, in case you do not have labels for validation set you
# can use the above approaches to get one from an oracle or some dataset
valid_set = HuggingFaceDatasets(raw_valid_set, tokenizer, input_key="text")
active_set, test_set = active_set, valid_set
def save_model(trainer):
    """Persist the trainer's current model under ./model in the working directory."""
    target_dir = os.path.join(os.getcwd(), "model")
    trainer.save_model(target_dir)
In the code below you set up the arguments for the Hugging Face trainer just like you would when fine-tuning a Hugging Face model.
Additionally, this time the trainer is BaalTransformersTrainer
instead of the traditional Trainer
offered by Hugging Face. This Baal trainer will keep track of your active learning loop.
# Setup Heuristics: the configured acquisition function ("bald") with
# shuffle_prop noise mixed in to counter selection bias
heuristic = get_heuristic(
    hyperparams["heuristic"], hyperparams["shuffle_prop"]
)
# Model save checkpoint: the AL epoch at which the model is saved to disk
save_checkpoint = 2
# Keep track of initial model weights so every AL iteration can restart
# fine-tuning from the same starting point
init_weights = deepcopy(hf_model.state_dict())
# Standard Hugging Face training arguments
training_args = TrainingArguments(
    output_dir=".",
    num_train_epochs=hyperparams["learning_epoch"],
    per_device_train_batch_size=hyperparams["batch_size"],
    per_device_eval_batch_size=hyperparams["batch_size"],
    weight_decay=0.01,
    logging_dir=".",
    no_cuda=no_cuda,
    save_total_limit=1,
)
# Active Learning Trainer Wrapper around the HF Trainer
baal_trainer = BaalTransformersTrainer(
    model=hf_model,
    args=training_args,
    train_dataset=active_set,
    eval_dataset=test_set,
    tokenizer=None,
)
# Log dict updated inside the AL loop below
logs = {}
logs["epoch"] = 0
The active learning loop below works as follows:
- Train the model on whatever initial train data we had gathered earlier in our active dataset.
- Evaluate the model on a separate evaluation set.
- Make predictions with dropouts enabled (MC-Dropout) to gather uncertainties for the pool samples in the active set, and use an acquisition function to get the most "important" samples for the human oracle.
- Once samples have been labelled by the human oracle, call the
label
method of the active dataset, which will label the samples. Note, as you might have noticed earlier, the active dataset makes sure that once samples are labelled they are removed from the pool and moved to train in the active dataset. - Save the model if needed.
- Finally, we load the initial weights of the model back, so that when the active learning loop runs again the model is fine-tuned on the initial train data plus the new train data that was added in the current active learning loop.
Repeat until some stopping criterion is reached.
for epoch in tqdm(range(hyperparams["epoch"])):
    # we use the default setup of HuggingFace for training (ex: epoch=1).
    # The setup is adjustable when BaalHuggingFaceTrainer is defined.
    baal_trainer.train()
    print("\n")
    # Validation!
    eval_metrics = baal_trainer.evaluate()
    print("\n")
    # MC-Dropout over the pool to gather per-sample uncertainties
    predictions = baal_trainer.predict_on_dataset(
        active_set.pool, iterations=hyperparams["iterations"]
    )
    print("\n")
    # Acquisition of the most informative samples according to the configured
    # heuristic (BALD, as set in hyperparams above)
    top_uncertainty = heuristic(predictions)[: hyperparams.get("query_size", 1)]
    # Send the samples for labelling from human oracle; skipped samples are
    # removed from the returned index array
    label_from_oracle, points_to_label_oracle = get_label_human_oracle(
        active_set, top_uncertainty
    )
    # Label active dataset
    active_set.label(points_to_label_oracle, label_from_oracle)
    # Save model at the configured checkpoint epoch
    if epoch == save_checkpoint:
        save_model(baal_trainer)
    # We reset the model weights to relearn from the new trainset.
    baal_trainer.load_state_dict(init_weights)
    baal_trainer.lr_scheduler = None
    active_logs = {
        "epoch": epoch,
        "labeled_data": active_set.labelled_map,
        "Next Training set size": len(active_set),
    }
    # NOTE(review): logs is overwritten each epoch, so only the final
    # iteration's metrics survive the loop — confirm if accumulation is wanted.
    logs = {**eval_metrics, **active_logs}
Now you might want to take this labelled data and maybe analyze it, or fine-tune a different model based on it. Some useful utilities for that:
active_set._dataset
: Provides access to all the data
active_set.is_labelled(idx)
: Lets you know if a sample at idx
is labelled via the active learning process or not
active_set.labelled
: A bool numpy array which keeps a record of which samples have been labelled in the AL process.
Using these you can easily pull your labelled data should the need arise.