Use Baal in production (Text classification)¶
In this tutorial, we will show you how to use Baal during your labeling task.
NOTE In this tutorial, we assume that we do not know the labels!
import os
import random
from copy import deepcopy
import numpy as np
import torch.backends
import transformers
# These packages are optional and not needed for BaaL main package.
# You can have access to `datasets` and `transformers` if you install
# BaaL with --dev setup.
from datasets import load_dataset
from tqdm import tqdm
from transformers import BertForSequenceClassification
from transformers import BertTokenizer, TrainingArguments
from transformers import set_seed
# Only warnings for HF
transformers.utils.logging.set_verbosity_warning()
from baal.active import get_heuristic
from baal.active.dataset.nlp_datasets import (
active_huggingface_dataset,
HuggingFaceDatasets,
)
from baal.bayesian.dropout import patch_module
from baal.transformers_trainer_wrapper import BaalTransformersTrainer
from typing import List
# Seed Python and PyTorch RNGs so the run is reproducible
random.seed(1337)
torch.manual_seed(1337)
# Set transformers seed to ensure that initial weights are identical
set_seed(101)
Information on the hyperparams below:
epoch: Number of times you want to run the AL loop
batch_size: The train and eval batch size for the HF trainer arguments
model: Hugging Face model
query_size: Number of samples you want to query at each AL iteration for labelling
heuristic: The acquisition function/heuristic based on which you want to query the important samples
iterations: The number of iterations you want to run MC-Dropout for to find the uncertainties
shuffle_prop: Additional noise to counter selection bias
learning_epoch: Training epochs for the Hugging Face trainer
# Active-learning hyperparameters; see the glossary above for details.
hyperparams = {
    "epoch": 10,  # number of AL loop iterations
    "batch_size": 4,  # train/eval batch size for the HF trainer
    "model": "bert-base-uncased",  # Hugging Face model name
    "query_size": 5,  # samples queried for labelling per AL iteration
    "heuristic": "bald",  # acquisition function used to rank pool samples
    "iterations": 15,  # MC-Dropout iterations used to estimate uncertainty
    "shuffle_prop": 0.05,  # extra noise to counter selection bias
    "learning_epoch": 3,  # fine-tuning epochs for the HF trainer
}
# Check for CUDA
use_cuda = torch.cuda.is_available()
torch.backends.cudnn.benchmark = True  # let cuDNN auto-tune kernel choices
# Load Model
# NOTE(review): force_download=True re-fetches the weights on every run;
# dropping it would reuse the local HF cache after the first download.
hf_model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=hyperparams["model"],
    force_download=True,
    num_labels=4,  # four emotion classes: anger, joy, optimism, sadness
)
# Setup tokenizer for model
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=hyperparams["model"]
)
# Enable dropouts for predictions (needed for MC-Dropout uncertainty)
hf_model = patch_module(hf_model)
# Send model to device and setup cuda arguments for the HF trainer
if use_cuda:
    hf_model.to("cuda:0")
    no_cuda = False
else:
    hf_model.to("cpu")
    no_cuda = True
We are using the dataset from Tweet Eval on Hugging Face. To mimic a production setting where we might need to load data from an external source, we have made a copy of this data to .csv
files. Of course you can modify this based on the source of your data.
There could be different scenarios for your train, test and validation datasets, especially if you are collecting data and evaluating the model simultaneously. Offered in this tutorial are 2 functions which can help you deal with most of these situations.
We will go over a scenario where we assume that you have some labelled train data but you still want a human oracle to label a few more samples. For simplicity we will assume that the validation and test sets are completely labelled. In a real-world scenario, if this is not the case for the validation set, you can re-purpose the train labeller
to label the validation set, or do it manually if you want.
def clip_data(split, percent=10, dataset=None):
    """
    Return a reduced copy of one dataset split for quick CPU demos.

    Args:
    ----
    split : Name of the split to clip (e.g. "train", "validation")
    percent : Percentage of the split to keep, taken from the start
    dataset : Mapping of split name -> HF dataset; defaults to the
        module-level ``dataset_emo`` loaded below

    Returns:
    ----
    The first ``percent`` % of rows of ``dataset[split]``
    """
    if dataset is None:
        dataset = dataset_emo
    keep = round((percent / 100) * len(dataset[split]))
    # Selecting the leading indices directly is equivalent to — and much
    # simpler than — building the exclusion set and taking its complement.
    return dataset[split].select(range(keep))
# Define labels in your dataset
# (tweet_eval "emotion": 0=anger, 1=joy, 2=optimism, 3=sadness — see the
# oracle prompt below)
label_list = [0, 1, 2, 3]
# Load data from files
dataset_emo = load_dataset("tweet_eval", "emotion")
# Reduce dataset size to 10% for CPU-only machines
if not use_cuda:
    print(
        "Complete dataset processing will take ages to run, clipping data just for demo on CPU"
    )
    raw_train_set = clip_data("train", 10)
    raw_valid_set = clip_data("validation", 10)
else:
    raw_train_set = dataset_emo["train"]
    raw_valid_set = dataset_emo["validation"]
def get_label_from_data(active_dataset, indexes) -> List[int]:
    """
    Fetch labels for pool samples from the already-labelled source data.

    Args:
    ----
    active_dataset : Active dataset which consists of train and pool
    indexes : Pool indexes of the points whose labels should be fetched

    Returns:
    ----
    labels: The label stored in ``raw_train_set`` for each requested point
    """
    # Pool indexes shift as points get labelled, so translate them back to
    # indexes into the original (oracle) dataset before looking labels up.
    oracle_indexes = active_dataset._pool_to_oracle_index(indexes)
    labels = []
    for oracle_idx in oracle_indexes:
        print(f"Adding labels for Raw data Index {oracle_idx} : {raw_train_set['text'][oracle_idx]}")
        print("\n")
        labels.append(raw_train_set["label"][oracle_idx])
    print("\n")
    return labels
def get_label_human_oracle(active_dataset, indexes) -> "tuple[List[int], np.ndarray]":
    """
    Ask a human oracle to label pool samples interactively.

    For each sample the oracle types one of the labels in ``label_list``,
    or -1 to skip a sample they are unsure about. Invalid input re-prompts.

    Args:
    ----
    active_dataset : Active dataset which consists of train and pool
    indexes : Pool indexes of the points to present to the oracle

    Returns:
    ----
    labels: The labels entered by the oracle (skipped samples excluded)
    indexes_upd: ``indexes`` with the skipped positions removed, aligned
        element-wise with ``labels``
    """
    labels = []
    skipped_positions = []  # positions within `indexes`, not pool indexes
    print(" 0: anger , 1: joy, 2: optimism, 3: sadness")
    for position, pool_idx in enumerate(indexes):
        while True:
            # Re-build the prompt on every retry so the sample stays visible.
            raw = input(
                f"Pool Index {pool_idx} : {active_dataset.pool.__getitem__(pool_idx)['inputs']}"
            )
            try:
                label = int(raw)
            except ValueError:
                print("Sorry, I didn't understand that.")
                continue
            if label == -1:
                print("Skipping this sample")
                skipped_positions.append(position)
                break
            if label in label_list:
                labels.append(label)
                break
            print(f"Allowed labels are {label_list}")
        print("\n")
    indexes_upd = np.delete(indexes, skipped_positions)
    return labels, indexes_upd
Some intuition on how active_huggingface_dataset
looks at your data: once you convert your data to an active_huggingface_dataset,
irrespective of the labels provided the complete dataset is considered as pool. If you have some already-labelled points, or you want to label some points, then you need to explicitly tell this to your active_huggingface_dataset
. The functions get_label_from_data
and get_label_human_oracle
above are provided for that specific purpose.
In our scenario we assume that we have 28 points for which we already have the label, and 2 points which we want the human oracle to label.
Then get_label_from_data
, called with the active dataset and the indexes of the points for which you already have labels, will return the indexes of the samples and the corresponding labels.
Calling get_label_human_oracle
with the active dataset and the indexes of the points will prompt an input from the human oracle, and again returns indexes of samples and corresponding labels. Note that you can pass -1
irrespective of your actual labels to skip a certain sample which you are still unsure about.
Once we have labels from these functions we can call the label
method on our active_huggingface_dataset
with all the indexes and the corresponding labels.
NOTE: Make sure before calling the label
method on our active_huggingface_dataset
you have set active_set.can_label = True
. This will ensure that the dataset can be labelled.
Once the samples are labelled in the active_set you can see that the pool length will decrease by the number of labelled samples; now you have a train and a pool set.
# Suppose now you have 30 indexes from the train set which are either to be
# labelled or already have labels. replace=False guarantees unique indexes.
point_idx_train = np.random.choice(len(raw_train_set) - 1, 30, replace=False)
# These are points for which labels are available in the source data
points_to_label_dataset = point_idx_train[:28]
# These are points which will need to be manually labelled by a human oracle
points_to_label_oracle = point_idx_train[-2:]
# Convert your dataset into an active learning dataset
active_set = active_huggingface_dataset(raw_train_set, tokenizer, input_key="text")
# Allow your active set to be labelled, without this you can't label the active set
active_set.can_label = True
# Now once your dataset is converted into an active dataset, the active dataset
# assumes all your points are part of the pool set and are unlabelled. Even
# if you have a label in the dataset for them
assert len(active_set.pool) == len(raw_train_set)
# Label points using data that you have
label_from_data = get_label_from_data(active_set, points_to_label_dataset)
# Label points directly using human oracle (returned indexes exclude any
# samples the oracle skipped with -1)
label_from_oracle, points_to_label_oracle = get_label_human_oracle(
    active_set, points_to_label_oracle
)
# Label active dataset with all gathered indexes and their labels
active_set.label(
    np.append(points_to_label_dataset, points_to_label_oracle),
    label_from_data + label_from_oracle,
)
print(f"Length of active pool is now", len(active_set.pool))
# Every labelled point moves from the pool into the train split
assert len(active_set.pool) == len(raw_train_set) - len(points_to_label_oracle) - len(
    points_to_label_dataset
)
# Setup validation set, in case you do not have labels for validation set you
# can use the above approaches to get one from an oracle or some dataset
valid_set = HuggingFaceDatasets(raw_valid_set, tokenizer, input_key="text")
active_set, test_set = active_set, valid_set
def save_model(trainer):
    """Persist the trainer's current model under ./model in the working directory."""
    target_dir = os.path.join(os.getcwd(), "model")
    trainer.save_model(target_dir)
In the code below you set up the arguments for the Hugging Face trainer just like you would when fine-tuning a Hugging Face model.
Additionally, this time the trainer is BaalTransformersTrainer
instead of the traditional Trainer
offered by Hugging Face. This Baal trainer will keep track of your active learning loop.
# Setup Heuristics: the configured acquisition function ("bald") with
# shuffle_prop noise mixed in to counter selection bias
heuristic = get_heuristic(
    hyperparams["heuristic"], hyperparams["shuffle_prop"]
)
# Model save checkpoint: the AL epoch at which the model is saved to disk
save_checkpoint = 2
# Keep track of initial model weights so every AL iteration can restart
# fine-tuning from the same starting point
init_weights = deepcopy(hf_model.state_dict())
# Standard Hugging Face training arguments
training_args = TrainingArguments(
    output_dir=".",
    num_train_epochs=hyperparams["learning_epoch"],
    per_device_train_batch_size=hyperparams["batch_size"],
    per_device_eval_batch_size=hyperparams["batch_size"],
    weight_decay=0.01,
    logging_dir=".",
    no_cuda=no_cuda,
    save_total_limit=1,
)
# Active Learning Trainer Wrapper around the HF Trainer
baal_trainer = BaalTransformersTrainer(
    model=hf_model,
    args=training_args,
    train_dataset=active_set,
    eval_dataset=test_set,
    tokenizer=None,
)
# Log dict updated inside the AL loop below
logs = {}
logs["epoch"] = 0
The active learning loop below works as follows:
- Train the model on whatever initial train data we had gathered earlier in our active dataset.
- Evaluate the model on a separate evaluation set.
- Make predictions with dropouts enabled (MC-Dropout) to gather uncertainties for the pool samples in the active set, and use an acquisition function to get the most "important" samples for the human oracle.
- Once samples have been labelled by the human oracle, call the
label
method of the active dataset, which will label the samples. Note, as you might have noticed earlier, the active dataset makes sure that once samples are labelled they are removed from the pool and moved to train in the active dataset. - Save the model if needed.
- Finally, we load the initial weights of the model back, so that when the active learning loop runs again the model is fine-tuned on the initial train data plus the new train data that was added in the current active learning loop.
Repeat until some stopping criterion is reached.
for epoch in tqdm(range(hyperparams["epoch"])):
    # we use the default setup of HuggingFace for training (ex: epoch=1).
    # The setup is adjustable when BaalHuggingFaceTrainer is defined.
    baal_trainer.train()
    print("\n")
    # Validation!
    eval_metrics = baal_trainer.evaluate()
    print("\n")
    # MC-Dropout over the pool to gather per-sample uncertainties
    predictions = baal_trainer.predict_on_dataset(
        active_set.pool, iterations=hyperparams["iterations"]
    )
    print("\n")
    # Acquisition of the most informative samples according to the configured
    # heuristic (BALD, as set in hyperparams above)
    top_uncertainty = heuristic(predictions)[: hyperparams.get("query_size", 1)]
    # Send the samples for labelling from human oracle; skipped samples are
    # removed from the returned index array
    label_from_oracle, points_to_label_oracle = get_label_human_oracle(
        active_set, top_uncertainty
    )
    # Label active dataset
    active_set.label(points_to_label_oracle, label_from_oracle)
    # Save model at the configured checkpoint epoch
    if epoch == save_checkpoint:
        save_model(baal_trainer)
    # We reset the model weights to relearn from the new trainset.
    baal_trainer.load_state_dict(init_weights)
    baal_trainer.lr_scheduler = None
    active_logs = {
        "epoch": epoch,
        "labeled_data": active_set.labelled_map,
        "Next Training set size": len(active_set),
    }
    # NOTE(review): logs is overwritten each epoch, so only the final
    # iteration's metrics survive the loop — confirm if accumulation is wanted.
    logs = {**eval_metrics, **active_logs}
Now you might want to take this labelled data and maybe analyze it, or fine-tune a different model based on it. Some useful utilities for that:
active_set._dataset
: Provides access to all the data
active_set.is_labelled(idx)
: Lets you know if a sample at idx
is labelled via the active learning process or not
active_set.labelled
: A bool numpy array which keeps a record of which samples have been labelled in the AL process.
Using these you can easily pull your labelled data should the need arise.