HuggingFace Compatibility

baal.transformers_trainer_wrapper.BaalTransformersTrainer

Bases: Trainer

The purpose of this wrapper is to provide extra capabilities to the HuggingFace Trainer so that it can output several forward passes per sample at prediction time and hence work with Baal. For a more detailed description of the arguments, refer to the HuggingFace Trainer documentation (https://huggingface.co/transformers/v3.0.2/main_classes/trainer.html). A usage sketch follows the parameter table below.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model | PreTrainedModel | The model to train, evaluate or use for predictions. | None |
| replicate_in_memory | bool | If True, will perform MC-Dropout in a single forward pass. It is faster, but more memory expensive. | True |
| data_collator | Optional[Callable] | The function to use to form a batch. | required |
| train_dataset | Optional[torch.utils.data.Dataset] | The dataset to use for training. | required |
| eval_dataset | Optional[torch.utils.data.Dataset] | The dataset to use for evaluation. | required |
| tokenizer | Optional[transformers.PreTrainedTokenizer] | A tokenizer provided by HuggingFace. | required |
| model_init | Optional[Dict] | Model initial weights for fine-tuning. | required |
| compute_metrics | Optional[Callable[[EvalPrediction], Dict]] | The function that will be used to compute metrics at evaluation. | required |
| callbacks | Optional[List[transformers.TrainerCallback]] | A list of callbacks to customize the training loop. | required |
| optimizers | Optional[Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]] | A tuple containing the optimizer and the scheduler to use. | required |
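
A minimal usage sketch (not taken verbatim from Baal's documentation): build a classification model, patch its dropout layers so they stay active at prediction time, and wrap it with BaalTransformersTrainer. The model name, output directory and `my_eval_dataset` are placeholders, and `patch_module` is assumed to come from `baal.bayesian.dropout`.

```python
from transformers import AutoModelForSequenceClassification, TrainingArguments

from baal.bayesian.dropout import patch_module  # keeps Dropout active at inference
from baal.transformers_trainer_wrapper import BaalTransformersTrainer

# Any HuggingFace classification model works; bert-base-uncased is only an example.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = patch_module(model)  # enable MC-Dropout

trainer = BaalTransformersTrainer(
    model=model,
    args=TrainingArguments(output_dir="/tmp/baal_outputs", per_device_eval_batch_size=8),
    replicate_in_memory=True,  # set to False if the replicated batches do not fit in memory
)

# `my_eval_dataset` is any dataset accepted by the Trainer's eval dataloader.
# predictions = trainer.predict_on_dataset(my_eval_dataset, iterations=20)
```
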
Source code in baal/transformers_trainer_wrapper.py
class BaalTransformersTrainer(Trainer):
    """
    The purpose of this wrapper is to provide extra capabilities for HuggingFace Trainer, so that
    it can output several forward pass for samples in prediction time and hence be able to work with
    baal. For a more detailed description of the arguments refer to (
    https://huggingface.co/transformers/v3.0.2/main_classes/trainer.html)

    Args:
        model (transformers.PreTrainedModel): The model to train, evaluate or use for predictions.
        replicate_in_memory: If True, will perform MC-Dropout in a single forward pass.
            It is faster, but more memory expensive. Default: True.
        data_collator (Optional(Callable)): The function to use to from a batch.
        train_dataset (Optional(torch.utils.data.Dataset)): The dataset to use for training.
        eval_dataset (Optional(torch.utils.data.Dataset)): The dataset to use for evaluation.
        tokenizer (Optional(transformers.PreTrainedTokenizer)): a tokenizer provided by huggingface.
        model_init (Optional(Dict)): Model initial weights for fine tuning.
        compute_metrics (Optional(Callable[[EvalPrediction], Dict])): The function that will be
            used to compute metrics at evaluation.
        callbacks (Optional(List[transformers.TrainerCallback])): A list of callbacks to customize
            the training loop.
        optimizers (Optional(Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR])):
            A tuple containing the optimizer and the scheduler to use.
    """

    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        args: TrainingArguments = None,
        replicate_in_memory=True,
        **kwargs
    ):
        self.replicate_in_memory = replicate_in_memory
        super().__init__(model=model, args=args, **kwargs)
        raise_warnings_cache_replicated(model, replicate_in_memory)

    def predict_on_dataset_generator(
        self,
        dataset,
        iterations: int = 1,
        half: bool = False,
        ignore_keys: Optional[List[str]] = None,
    ):
        """
        Use the model to predict on a dataset `iterations` time.

        Args:
            dataset (Dataset): Dataset to predict on.
            iterations (int): Number of iterations per sample.
            half (bool): If True use half precision.
            ignore_keys (Optional[List[str]]): A list of keys in the output of your model
                (if it is a dictionary) that should be ignored when gathering predictions.
        Notes:
            The "batch" is made of `batch_size` * `iterations` samples.

        Returns:
            Generators [batch_size, n_classes, ..., n_iterations].
        """

        dataloader = self.get_eval_dataloader(dataset)
        log.info("Start Predict", dataset=len(dataset))

        model = self.model

        model.eval()
        for step, inputs in enumerate(tqdm(dataloader)):
            if self.replicate_in_memory:
                try:
                    # We perform MC-Dropout in a single pass, fast, but memory intensive.
                    inputs = map_on_tensor(
                        lambda element: map_on_tensor(
                            lambda d: stack_in_memory(d, iterations), element
                        ),
                        inputs,
                    )
                    _, out, _ = self.prediction_step(
                        model, inputs, prediction_loss_only=False, ignore_keys=ignore_keys
                    )
                except RuntimeError as e:
                    if "CUDA out of memory" in str(e):
                        raise RuntimeError(
                            """CUDA ran out of memory while Baal tried to replicate data. See the exception above.
                        Use `replicate_in_memory=False` in order to reduce the memory requirements.
                        Note that there will be some speed trade-offs"""
                        ) from e
                    raise e
                out = map_on_tensor(lambda o: o.view([iterations, -1, *o.size()[1:]]), out)
            else:
                # We perform a forward pass `iterations` time. Slower, but memory efficient.
                out = [
                    self.prediction_step(
                        model, inputs, prediction_loss_only=False, ignore_keys=ignore_keys
                    )[1]
                    for _ in range(iterations)
                ]
                out = _stack_preds(out)
            # Swap axes to match Baal [Batch_size, Classes, ..., Iterations]
            out = map_on_tensor(lambda o: o.permute(1, *range(3, o.ndimension()), 2, 0), out)
            out = map_on_tensor(lambda x: x.detach(), out)
            if half:
                out = map_on_tensor(lambda x: x.half(), out)
            yield map_on_tensor(lambda x: x.cpu().numpy(), out)

    def predict_on_dataset(
        self,
        dataset,
        iterations: int = 1,
        half: bool = False,
        ignore_keys: Optional[List[str]] = None,
    ):

        """
        Use the model to predict on a dataset `iterations` time.

        Args:
            dataset (Dataset): Dataset to predict on.
            iterations (int): Number of iterations per sample.
            half (bool): If True use half precision.
            ignore_keys (Optional[List[str]]): A list of keys in the output of your model
                (if it is a dictionary) that should be ignored when gathering predictions.
        Notes:
            The "batch" is made of `batch_size` * `iterations` samples.

        Returns:
            Array [n_samples, n_outputs, ..., n_iterations].
        """
        preds = list(
            self.predict_on_dataset_generator(
                dataset=dataset, iterations=iterations, half=half, ignore_keys=ignore_keys
            )
        )

        if len(preds) > 0 and not isinstance(preds[0], Sequence):
            # Is an Array or a Tensor
            return np.vstack(preds)
        return [np.vstack(pr) for pr in zip(*preds)]

    def load_state_dict(self, state_dict, strict=True):
        """Load the model with `state_dict`."""
        self.model.load_state_dict(state_dict, strict=strict)

load_state_dict(state_dict, strict=True)

Load the model with state_dict.

Source code in baal/transformers_trainer_wrapper.py
def load_state_dict(self, state_dict, strict=True):
    """Load the model with `state_dict`."""
    self.model.load_state_dict(state_dict, strict=strict)
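
A common active-learning pattern is to retrain from the same initial weights at every step; the sketch below (assuming the trainer built in the earlier example) keeps a copy of those weights and restores them with load_state_dict.

```python
import copy

# Snapshot the freshly initialized weights once.
initial_weights = copy.deepcopy(trainer.model.state_dict())

# ... later, before retraining on the enlarged labelled pool:
trainer.load_state_dict(initial_weights)
```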

predict_on_dataset(dataset, iterations=1, half=False, ignore_keys=None)

Use the model to predict on a dataset, running iterations stochastic forward passes per sample.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dataset | Dataset | Dataset to predict on. | required |
| iterations | int | Number of iterations per sample. | 1 |
| half | bool | If True, use half precision. | False |
| ignore_keys | Optional[List[str]] | A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. | None |

Notes: The "batch" is made of batch_size * iterations samples.

Returns:

Array [n_samples, n_outputs, ..., n_iterations].
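
The stacked array can be handed directly to a Baal acquisition heuristic. A hedged sketch, assuming the trainer from the earlier example, a placeholder unlabelled pool, and that BALD is available under baal.active.heuristics:

```python
from baal.active.heuristics import BALD

# Shape: [n_samples, n_classes, n_iterations] for a classification model.
predictions = trainer.predict_on_dataset(pool, iterations=20)

# Indices of the pool samples, ranked from most to least informative.
ranked_indices = BALD()(predictions)
```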

Source code in baal/transformers_trainer_wrapper.py
def predict_on_dataset(
    self,
    dataset,
    iterations: int = 1,
    half: bool = False,
    ignore_keys: Optional[List[str]] = None,
):

    """
    Use the model to predict on a dataset `iterations` time.

    Args:
        dataset (Dataset): Dataset to predict on.
        iterations (int): Number of iterations per sample.
        half (bool): If True use half precision.
        ignore_keys (Optional[List[str]]): A list of keys in the output of your model
            (if it is a dictionary) that should be ignored when gathering predictions.
    Notes:
        The "batch" is made of `batch_size` * `iterations` samples.

    Returns:
        Array [n_samples, n_outputs, ..., n_iterations].
    """
    preds = list(
        self.predict_on_dataset_generator(
            dataset=dataset, iterations=iterations, half=half, ignore_keys=ignore_keys
        )
    )

    if len(preds) > 0 and not isinstance(preds[0], Sequence):
        # Is an Array or a Tensor
        return np.vstack(preds)
    return [np.vstack(pr) for pr in zip(*preds)]

predict_on_dataset_generator(dataset, iterations=1, half=False, ignore_keys=None)

Use the model to predict on a dataset, running iterations stochastic forward passes per sample, and yield the results one batch at a time.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dataset | Dataset | Dataset to predict on. | required |
| iterations | int | Number of iterations per sample. | 1 |
| half | bool | If True, use half precision. | False |
| ignore_keys | Optional[List[str]] | A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. | None |

Notes: The "batch" is made of batch_size * iterations samples.

Returns:

Generator of arrays with shape [batch_size, n_classes, ..., n_iterations].
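
Because results are yielded batch by batch, memory stays bounded even for large pools. Below is a hedged sketch of consuming the generator to compute predictive entropy, assuming a classification model that yields a single logits array and the placeholders from the earlier examples:

```python
import numpy as np

uncertainties = []
for batch in trainer.predict_on_dataset_generator(pool, iterations=20):
    # batch: [batch_size, n_classes, n_iterations]; softmax over the class axis
    # in case the model emits raw logits.
    exp = np.exp(batch - batch.max(axis=1, keepdims=True))
    probs = exp / exp.sum(axis=1, keepdims=True)
    mean_probs = probs.mean(axis=-1)  # average over MC iterations
    entropy = -(mean_probs * np.log(mean_probs + 1e-12)).sum(axis=1)
    uncertainties.append(entropy)

uncertainties = np.concatenate(uncertainties)
```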

Source code in baal/transformers_trainer_wrapper.py
def predict_on_dataset_generator(
    self,
    dataset,
    iterations: int = 1,
    half: bool = False,
    ignore_keys: Optional[List[str]] = None,
):
    """
    Use the model to predict on a dataset `iterations` time.

    Args:
        dataset (Dataset): Dataset to predict on.
        iterations (int): Number of iterations per sample.
        half (bool): If True use half precision.
        ignore_keys (Optional[List[str]]): A list of keys in the output of your model
            (if it is a dictionary) that should be ignored when gathering predictions.
    Notes:
        The "batch" is made of `batch_size` * `iterations` samples.

    Returns:
        Generators [batch_size, n_classes, ..., n_iterations].
    """

    dataloader = self.get_eval_dataloader(dataset)
    log.info("Start Predict", dataset=len(dataset))

    model = self.model

    model.eval()
    for step, inputs in enumerate(tqdm(dataloader)):
        if self.replicate_in_memory:
            try:
                # We perform MC-Dropout in a single pass, fast, but memory intensive.
                inputs = map_on_tensor(
                    lambda element: map_on_tensor(
                        lambda d: stack_in_memory(d, iterations), element
                    ),
                    inputs,
                )
                _, out, _ = self.prediction_step(
                    model, inputs, prediction_loss_only=False, ignore_keys=ignore_keys
                )
            except RuntimeError as e:
                if "CUDA out of memory" in str(e):
                    raise RuntimeError(
                        """CUDA ran out of memory while Baal tried to replicate data. See the exception above.
                    Use `replicate_in_memory=False` in order to reduce the memory requirements.
                    Note that there will be some speed trade-offs"""
                    ) from e
                raise e
            out = map_on_tensor(lambda o: o.view([iterations, -1, *o.size()[1:]]), out)
        else:
            # We perform a forward pass `iterations` time. Slower, but memory efficient.
            out = [
                self.prediction_step(
                    model, inputs, prediction_loss_only=False, ignore_keys=ignore_keys
                )[1]
                for _ in range(iterations)
            ]
            out = _stack_preds(out)
        # Swap axes to match Baal [Batch_size, Classes, ..., Iterations]
        out = map_on_tensor(lambda o: o.permute(1, *range(3, o.ndimension()), 2, 0), out)
        out = map_on_tensor(lambda x: x.detach(), out)
        if half:
            out = map_on_tensor(lambda x: x.half(), out)
        yield map_on_tensor(lambda x: x.cpu().numpy(), out)

baal.active.dataset.nlp_datasets.HuggingFaceDatasets

Bases: Dataset

Support for huggingface.datasets (https://github.com/huggingface/datasets). The purpose of this wrapper is to separate the labels from the rest of the sample information and make the dataset ready to be used by baal.active.ActiveLearningDataset. A usage sketch follows the parameter table below.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dataset | Dataset | A dataset provided by HuggingFace. | required |
| tokenizer | PreTrainedTokenizer | A tokenizer provided by HuggingFace. | None |
| target_key | str | Target key used in the dataset's dictionary. | 'label' |
| input_key | str | Input key used in the dataset's dictionary. | 'sentence' |
| max_seq_len | int | Maximum sequence length; shorter sequences are padded to this length. | 128 |
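
A hedged sketch of wrapping a datasets split for active learning; the GLUE/SST-2 split, tokenizer and initial 100 labels are placeholders, and ActiveLearningDataset is assumed to be importable from baal.active:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

from baal.active import ActiveLearningDataset
from baal.active.dataset.nlp_datasets import HuggingFaceDatasets

raw = load_dataset("glue", "sst2", split="train")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

wrapped = HuggingFaceDatasets(
    raw,
    tokenizer=tokenizer,
    target_key="label",
    input_key="sentence",
    max_seq_len=128,
)

active_set = ActiveLearningDataset(wrapped)
active_set.label_randomly(100)  # seed the initial labelled pool
```
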
Source code in baal/active/dataset/nlp_datasets.py
class HuggingFaceDatasets(Dataset):
    """
    Support for `huggingface.datasets`: (https://github.com/huggingface/datasets).
    The purpose of this wrapper is to separate the labels from the rest of the sample information
    and make the dataset ready to be used by `baal.active.ActiveLearningDataset`.

    Args:
        dataset (Dataset): a dataset provided by huggingface.
        tokenizer (transformers.PreTrainedTokenizer): a tokenizer provided by huggingface.
        target_key (str): target key used in the dataset's dictionary.
        input_key (str): input key used in the dataset's dictionary.
        max_seq_len (int): max length of a sequence to be used for padding the shorter
            sequences.
    """

    def __init__(
        self,
        dataset: HFDataset,
        tokenizer=None,
        target_key: str = "label",
        input_key: str = "sentence",
        max_seq_len: int = 128,
    ):
        self.dataset = dataset
        self.targets, self.texts = self.dataset[target_key], self.dataset[input_key]
        self.targets_list: List = np.unique(self.targets).tolist()
        self.input_ids, self.attention_masks = (
            self._tokenize(tokenizer, max_seq_len) if tokenizer else ([], [])
        )

    @property
    def num_classes(self):
        return len(self.targets_list)

    def _tokenize(self, tokenizer, max_seq_len):
        # For speed purposes, we should use fast tokenizers here, but that is up to the caller
        tokenized = tokenizer(
            self.texts,
            add_special_tokens=True,
            max_length=max_seq_len,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )
        return tokenized["input_ids"], tokenized["attention_mask"]

    def label(self, idx: int, value: int):
        """Label the item.

        Args:
            idx: index to label
            value: Value to label the index.
        """
        self.targets[idx] = value

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        target = self.targets_list.index(self.targets[idx])

        return {
            "input_ids": self.input_ids[idx].flatten() if len(self.input_ids) > 0 else None,
            "inputs": self.texts[idx],
            "attention_mask": self.attention_masks[idx].flatten()
            if len(self.attention_masks) > 0
            else None,
            "label": torch.tensor(target, dtype=torch.long),
        }

label(idx, value)

Label the item.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| idx | int | Index of the item to label. | required |
| value | int | Value to assign as the label. | required |
Source code in baal/active/dataset/nlp_datasets.py
def label(self, idx: int, value: int):
    """Label the item.

    Args:
        idx: index to label
        value: Value to label the index.
    """
    self.targets[idx] = value
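
A short usage note, assuming the wrapped dataset from the earlier sketch: once an oracle provides a class for a given sample, the raw target value is written back in place.

```python
# Record that the oracle labelled sample 42 as class 1 (placeholder values).
wrapped.label(idx=42, value=1)
```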