Source code for torchtraining.pytorch

"""This module provides standard PyTorch operations (like `backward`)
in functional manner.

.. note::

    **IMPORTANT**: This module is used almost all the time,
    so be sure to understand how it works.


It allows users to compose a single `step` for both training and evaluation
from PyTorch primitives such as stepping the optimizer, running
backpropagation or zeroing gradients, for example::


    class Step(tt.steps.Step):
        def forward(self, module, sample):
            # Your forward step here
            ...
            return loss, predictions

    training = (
        Step(criterion, gradient=True, device=device)
        ** tt.Select(loss=0)
        ** tt.pytorch.ZeroGrad(network)
        ** tt.pytorch.Backward()
        ** tt.pytorch.Optimize(optimizer)
        ** tt.pytorch.Detach()
    )

    evaluation = (
        Step(criterion, gradient=False, device=device)
        ** tt.Select(predictions=1)
        ** tt.callbacks.Log(writer, "Predicted")
    )

Some other operations are also simplified (e.g. gradient accumulation);
see `torchtraining.pytorch.Optimize` and the sketch below.
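
For example, a minimal sketch of gradient accumulation over `4` batches
(assuming a `Step` subclass and `network`/`optimizer` objects as defined
above; note that `ZeroGrad` is placed after `Optimize` so accumulated
gradients are only cleared once they have been used)::

    training = (
        Step(criterion, gradient=True, device=device)
        ** tt.Select(loss=0)
        ** tt.pytorch.Backward(accumulate=4)
        ** tt.pytorch.Optimize(optimizer, accumulate=4)
        ** tt.pytorch.ZeroGrad(network, accumulate=4)
        ** tt.pytorch.Detach()
    )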

"""

import torch

from ._base import Operation


class Detach(Operation):
    """Return a new tensor, detached from the current graph.

    .. note::

        **IMPORTANT**: This operation should be used before accumulating
        values after `iteration` in order not to grow the backpropagation
        graph.

    Returns
    -------
    torch.Tensor
        Detached tensor

    """

    def forward(self, data):
        """
        Arguments
        ---------
        data : torch.Tensor
            Tensor to be detached (a new tensor is returned).

        """
        return data.detach()
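
# A minimal sketch of `Detach` in isolation (assuming `Operation` instances
# are callable like `torch.nn.Module`; `criterion`, `network`, `inputs` and
# `targets` are hypothetical):
#
#     loss = criterion(network(inputs), targets)  # part of the graph
#     value = Detach()(loss)  # `value.requires_grad` is False, so
#                             # accumulating it keeps no graph alive
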
class Schedule(Operation):
    """Run a single step of the given scheduler.

    Usually placed after each `step` or `iteration` (depending on the
    provided scheduler instance).

    Parameters
    ----------
    scheduler : torch.optim.lr_scheduler._LRScheduler
        Instance of a scheduler-like object with an interface aligned
        with the `torch.optim.lr_scheduler._LRScheduler` base class.
    use_data : bool
        Whether the input data should be used when stepping the scheduler
        (required by metric-driven schedulers such as
        `torch.optim.lr_scheduler.ReduceLROnPlateau`).
        Default: `False`

    Returns
    -------
    torch.Tensor
        Value passed to the operation initially

    """

    def __init__(self, scheduler, use_data: bool = False):
        super().__init__()
        self.scheduler = scheduler
        self.use_data = use_data

    def forward(self, data):
        """
        Arguments
        ---------
        data : torch.Tensor
            Tensor which is optionally used to step the scheduler.

        """
        if self.use_data:
            self.scheduler.step(data)
        else:
            self.scheduler.step()
        return data
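
# A minimal sketch of stepping a metric-driven scheduler with the piped
# value (`optimizer` is hypothetical):
#
#     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
#     schedule = Schedule(scheduler, use_data=True)
#     # piping a validation loss into `schedule` now runs
#     # `scheduler.step(loss)` and returns the loss unchanged
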
class Backward(Operation):
    """Run backpropagation on the output tensor.

    Parameters
    ----------
    scaler : torch.cuda.amp.GradScaler, optional
        Gradient scaler used for automatic mixed precision mode.
        Default: `None`
    accumulate : int, optional
        Divide the loss by ``accumulate`` if gradient accumulation is used.
        This approach averages the gradient over multiple batches.
        Default: `1` (no accumulation)
    gradient : torch.Tensor, optional
        Tensor used as the initial gradient for backpropagation.
        If unspecified, the call behaves just like a plain
        `tensor.backward()` (an implicit gradient of `1.0` for scalar
        outputs). Default: `None`

    Returns
    -------
    torch.Tensor
        Tensor after backward (possibly scaled by `accumulate`)

    """

    def __init__(
        self, scaler=None, accumulate: int = 1, gradient: torch.Tensor = None
    ):
        super().__init__()
        self.scaler = scaler
        self.accumulate = accumulate
        self.gradient = gradient

    def forward(self, data):
        """
        Arguments
        ---------
        data : torch.Tensor
            Tensor on which `backward` will be run (possibly accumulated);
            usually the `loss` value.

        """
        output = data / self.accumulate
        if self.scaler is not None:
            output = self.scaler.scale(output)
        if self.gradient is not None:
            output.backward(self.gradient)
        else:
            output.backward()
        return output
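
# A minimal sketch of the accumulation path: each call runs
# `(loss / 4).backward()`, so gradients from 4 batches average out
# (`loss` is hypothetical):
#
#     backward = Backward(accumulate=4)
#     scaled_loss = backward(loss)
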
class Optimize(Operation):
    """Perform an optimization step on `parameters` stored by `optimizer`.

    Currently, specifying `closure` and `scaler` is mutually exclusive.

    Parameters
    ----------
    optimizer : torch.optim.Optimizer
        Instance of an optimizer-like object with an interface aligned
        with `torch.optim.Optimizer`.
    accumulate : int, optional
        Step the optimizer only once every ``accumulate`` calls; use
        together with `Backward(accumulate=...)` to average gradients
        over multiple batches. Default: `1` (no accumulation)
    closure : Callable, optional
        A closure that reevaluates the model and returns the loss.
        Optional for most optimizers. Default: `None`
    scaler : torch.cuda.amp.GradScaler, optional
        Gradient scaler used for automatic mixed precision mode.
        Default: `None`
    *args
        Arguments passed to either `scaler.step` (if specified)
        or `optimizer.step`
    **kwargs
        Keyword arguments passed to either `scaler.step` (if specified)
        or `optimizer.step`

    Returns
    -------
    Any
        Anything passed to `forward`.

    """

    def __init__(
        self,
        optimizer,
        accumulate: int = 1,
        closure=None,
        scaler=None,
        *args,
        **kwargs
    ):
        super().__init__()
        self.optimizer = optimizer
        self.accumulate = accumulate
        if scaler is not None and closure is not None:
            raise ValueError("Closure use with scaler is not currently supported.")
        self.scaler = scaler
        self.closure = closure
        self.args = args
        self.kwargs = kwargs
        self._counter = -1

    def forward(self, data):
        """
        Arguments
        ---------
        data : Any
            Anything, as it does not influence this operation.

        """
        self._counter += 1
        # Step once every `accumulate` calls (every call when `accumulate == 1`)
        if (self._counter + 1) % self.accumulate == 0:
            if self.scaler is not None:
                self.scaler.step(self.optimizer, *self.args, **self.kwargs)
            else:
                self.optimizer.step(self.closure, *self.args, **self.kwargs)
        return data
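
# A minimal sketch matching `Backward(accumulate=4)` above: parameters are
# updated on every 4th call only, once gradients from 4 batches have
# accumulated (`optimizer` is hypothetical):
#
#     optimize = Optimize(optimizer, accumulate=4)
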
class ZeroGrad(Operation):
    """Zero gradients of a model or optimizer.

    The provided object's `zero_grad()` method will be run. Usually
    called after every `step` (or after multiple steps, see the
    `accumulate` argument).

    Parameters
    ----------
    obj : torch.optim.Optimizer | torch.nn.Module
        Instance of the object to zero gradients on.
    accumulate : int
        Accumulate gradients for the specified number of iterations
        before zeroing them out. Default: `1`

    Returns
    -------
    Any
        Anything passed to `forward`.

    """

    def __init__(self, obj, accumulate: int = 1):
        super().__init__()
        self.obj = obj
        self.accumulate = accumulate
        self._counter = -1

    def forward(self, data):
        """
        Arguments
        ---------
        data : Any
            Anything, as it does not influence this operation.

        """
        self._counter += 1
        # Zero gradients once every `accumulate` calls
        # (every call when `accumulate == 1`)
        if (self._counter + 1) % self.accumulate == 0:
            self.obj.zero_grad()
        return data
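
# A minimal sketch completing the accumulation setup above: gradients are
# cleared once every 4 iterations, right after `Optimize` has used them
# (`network` is hypothetical):
#
#     zero_grad = ZeroGrad(network, accumulate=4)
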
class UpdateGradScaler(Operation):
    """Update the gradient scaler used with automatic mixed precision.

    Parameters
    ----------
    scaler : torch.cuda.amp.GradScaler
        Gradient scaler used for automatic mixed precision mode.

    Returns
    -------
    Any
        Anything passed to `forward`.

    """

    def __init__(self, scaler):
        super().__init__()
        self.scaler = scaler

    def forward(self, data):
        """
        Arguments
        ---------
        data : Any
            Anything, as it does not influence this operation.

        """
        self.scaler.update()
        return data
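
# A minimal sketch of a full automatic mixed precision pipeline (assuming a
# `Step` subclass as in the module docstring, with `torch.cuda.amp.autocast`
# used inside its `forward`; `criterion`, `device` and `optimizer` are
# hypothetical):
#
#     scaler = torch.cuda.amp.GradScaler()
#     training = (
#         Step(criterion, gradient=True, device=device)
#         ** tt.Select(loss=0)
#         ** tt.pytorch.Backward(scaler=scaler)
#         ** tt.pytorch.Optimize(optimizer, scaler=scaler)
#         ** tt.pytorch.UpdateGradScaler(scaler)
#         ** tt.pytorch.ZeroGrad(optimizer)
#         ** tt.pytorch.Detach()
#     )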