HuggingFace Compatibility
baal.transformers_trainer_wrapper.BaalTransformersTrainer
Bases: Trainer
The purpose of this wrapper is to provide extra capabilities to the HuggingFace Trainer so that it can output several forward passes per sample at prediction time and hence work with Baal. For a more detailed description of the arguments, refer to https://huggingface.co/transformers/v3.0.2/main_classes/trainer.html.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model | transformers.PreTrainedModel | The model to train, evaluate, or use for predictions. | required |
data_collator | Optional[Callable] | The function to use to form a batch. | required |
train_dataset | Optional[torch.utils.data.Dataset] | The dataset to use for training. | required |
eval_dataset | Optional[torch.utils.data.Dataset] | The dataset to use for evaluation. | required |
tokenizer | Optional[transformers.PreTrainedTokenizer] | A tokenizer provided by HuggingFace. | required |
model_init | Optional[Dict] | Model initial weights for fine-tuning. | required |
compute_metrics | Optional[Callable[[EvalPrediction], Dict]] | The function that will be used to compute metrics at evaluation. | required |
callbacks | Optional[List[transformers.TrainerCallback]] | A list of callbacks to customize the training loop. | required |
optimizers | Optional[Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]] | A tuple containing the optimizer and the scheduler to use. | required |
Source code in baal/transformers_trainer_wrapper.py
load_state_dict(state_dict, strict=True)
predict_on_dataset(dataset, iterations=1, half=False, ignore_keys=None)
Use the model to predict on a dataset `iterations` times.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | Dataset | Dataset to predict on. | required |
iterations | int | Number of iterations per sample. | 1 |
half | bool | If True, use half precision. | False |
ignore_keys | Optional[List[str]] | A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. | None |
Notes
The "batch" is made of `batch_size * iterations` samples.
Returns:
Type | Description |
---|---|
Array | [n_samples, n_outputs, ..., n_iterations]. |
Source code in baal/transformers_trainer_wrapper.py
predict_on_dataset_generator(dataset, iterations=1, half=False, ignore_keys=None)
Use the model to predict on a dataset `iterations` times.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | Dataset | Dataset to predict on. | required |
iterations | int | Number of iterations per sample. | 1 |
half | bool | If True, use half precision. | False |
ignore_keys | Optional[List[str]] | A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. | None |
Notes
The "batch" is made of `batch_size * iterations` samples.
Returns:
Type | Description |
---|---|
Generator | Yields arrays of shape [batch_size, n_classes, ..., n_iterations]. |
Source code in baal/transformers_trainer_wrapper.py
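The generator variant yields one stacked batch at a time instead of a single array, which keeps memory bounded on large unlabelled pools. Consuming such a generator can be sketched in plain Python (the generator below is a toy stand-in, not the actual method):

```python
def toy_prediction_generator(n_samples, batch_size, n_classes, iterations):
    # Toy stand-in: yields batches shaped [batch, n_classes, n_iterations],
    # the way predict_on_dataset_generator yields one stacked batch at a time.
    for start in range(0, n_samples, batch_size):
        batch = min(batch_size, n_samples - start)
        yield [[[0.0] * iterations for _ in range(n_classes)]
               for _ in range(batch)]

def collect(generator):
    # Concatenating the batches along the first axis recovers the full
    # [n_samples, n_classes, ..., n_iterations] structure that
    # predict_on_dataset would return directly.
    out = []
    for batch in generator:
        out.extend(batch)
    return out

preds = collect(toy_prediction_generator(10, batch_size=4, n_classes=2, iterations=3))
print(len(preds))  # 10
```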
baal.active.dataset.nlp_datasets.HuggingFaceDatasets
Bases: Dataset
Support for huggingface.datasets (https://github.com/huggingface/datasets).
The purpose of this wrapper is to separate the labels from the rest of the sample information and make the dataset ready to be used by baal.active.ActiveLearningDataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | Dataset | A dataset provided by HuggingFace. | required |
tokenizer | transformers.PreTrainedTokenizer | A tokenizer provided by HuggingFace. | None |
target_key | str | Target key used in the dataset's dictionary. | 'label' |
input_key | str | Input key used in the dataset's dictionary. | 'sentence' |
max_seq_len | int | Max length of a sequence, used to pad shorter sequences. | 128 |
Source code in baal/active/dataset/nlp_datasets.py
label(idx, value)
Label the item.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
idx | int | Index to label. | required |
value | int | Value to label the index with. | required |
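The label-separation contract described above can be sketched in plain Python. The class below is a toy stand-in, not the actual HuggingFaceDatasets implementation (tokenization and padding are omitted):

```python
class ToyActiveDataset:
    # Toy stand-in mirroring the wrapper's contract: inputs and targets are
    # read from the raw records by configurable keys, and label() writes a
    # new target so the sample can move into the labelled pool.
    def __init__(self, records, target_key="label", input_key="sentence"):
        self.records = records
        self.target_key = target_key
        self.input_key = input_key

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        return {"inputs": record[self.input_key],
                "label": record[self.target_key]}

    def label(self, idx, value):
        # Label the item at `idx` with `value`.
        self.records[idx][self.target_key] = value

ds = ToyActiveDataset([{"sentence": "good movie", "label": -1}])
ds.label(0, 1)
print(ds[0]["label"])  # 1
```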