
Source code for torchtraining.accelerators.horovod

"""This module allows user to train networks in distributed manner using `horovod`

.. note::

    **IMPORTANT**: This module is experimental and may not be working
    correctly. Use at your own risk and report any issues you find.

.. note::

    **IMPORTANT**: This module needs the `horovod` Python package to be installed.
    You can install it with `pip install -U torchtraining[horovod]`.
    You should also export the `CUDA_HOME` variable, for example:
    `CUDA_HOME=/opt/cuda pip install -U torchtraining[horovod]` (your path may vary).



See `Horovod documentation <https://github.com/horovod/horovod>`__ for details
about the framework (installation, capabilities etc.).


Example::

    import torchtraining as tt
    import torchtraining.accelerators.horovod as horovod


    class TrainStep(tt.steps.Train):
        def forward(self, module, sample):
            # Dummy step computing loss for a batch
            images, labels = sample
            loss = self.criterion(module(images), labels)
            return loss


    model = ...
    criterion = ...
    dataset = ...
    optimizer = ...
    writer = ...
    device = ...


    # Accelerate!
    accelerator = tt.accelerators.Horovod(model, optimizer)

    # Distributed optimization with gradient accumulation
    optimizer = horovod.optimizer(optimizer, model.named_parameters())

    # Special distributed DataLoader
    dataloader = horovod.DataLoader(dataset, batch_size=64)


    step = (
        TrainStep(criterion, device)
        ** tt.pytorch.ZeroGrad()
        ** tt.pytorch.Backward()
        ** tt.pytorch.Optimize(optimizer)
    )
    iteration = (
        tt.iterations.TrainIteration(step, model, dataloader)
        ** horovod.AllReduce()
        ** tt.accumulators.Mean()
        ** horovod.OnRank(tt.callbacks.Tensorboard(writer, "Loss"))
    )

The specific `operations` integrated with `torchtraining` are listed below.

"""

import pickle
import typing

import torch

import horovod.torch as hvd

from .._base import Operation


def _reduction(name):
    mapping = {
        "sum": hvd.Sum,
        "mean": hvd.Average,
    }
    value = mapping.get(name.lower())
    if value is None:
        raise ValueError(
            "reduction can be one of {}, got {}".format(mapping.keys(), name)
        )
    return value


def _compression(name):
    mapping = {
        "none": hvd.compression.NoneCompressor(),
        "fp16": hvd.compression.FP16Compressor(),
    }
    value = mapping.get(name.lower())
    if value is None:
        raise ValueError(
            "compression can be one of {}, got {}".format(mapping.keys(), compression)
        )
    return value


class OnRank(Operation):
    """Run any operation only if it is executed in the specified process (rank).

    Otherwise return `data` unchanged.

    Parameters
    ----------
    operation: tt.Operation
        Operation to run (`callbacks`, `metrics` and whatever else you want).
    rank: int, optional
        Rank (process) on which the operation will be run.
        Default: `0` (main process)

    Returns
    -------
    data | operation(data)
        If run in the specified process, return `operation(data)`.
        Otherwise forward `data` without changes.

    """

    def __init__(
        self, operation: Operation, rank: int = 0,
    ):
        self.operation = operation
        self.rank = rank

    def forward(self, data: typing.Any):
        """
        Arguments
        ---------
        data: Any
            Input required by `operation`

        """
        if hvd.rank() == self.rank:
            return self.operation(data)
        return data
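
# Usage sketch (hedged, names are illustrative): run a logging callback in the
# main process only while other ranks pass the value through untouched. Assumes
# `writer` is a `torch.utils.tensorboard.SummaryWriter` and `loss` is a value
# produced earlier in a pipeline.
#
#     import torchtraining as tt
#
#     log_loss = OnRank(tt.callbacks.Tensorboard(writer, "Loss"), rank=0)
#     loss = log_loss(loss)  # logged on rank 0, forwarded unchanged elsewhere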

class DataLoader(torch.utils.data.DataLoader):
    """PyTorch `torch.utils.data.DataLoader` suited for `horovod` integration.

    Works exactly like its PyTorch counterpart but creates an appropriate
    `torch.utils.data.distributed.DistributedSampler` under the hood
    (hence users cannot specify `sampler` or `batch_sampler`).

    Arguments
    ---------
    dataset: torch.utils.data.Dataset
        Dataset from which to load the data.
    batch_size: int, optional
        How many samples per batch to load. Default: ``1``
    shuffle: bool, optional
        Set to ``True`` to have the data reshuffled at every epoch. Default: ``False``
    num_workers: int, optional
        How many subprocesses to use for data loading. ``0`` means that the data
        will be loaded in the main process. Default: ``0``
    collate_fn: Callable, optional
        Merges a list of samples to form a mini-batch of Tensor(s). Used when
        using batched loading from a map-style dataset.
        Default: `None` (default PyTorch collation)
    pin_memory: bool, optional
        If ``True``, the data loader will copy `torch.Tensor`s into CUDA pinned
        memory before returning them. Default: ``False``
    drop_last: bool, optional
        Set to ``True`` to drop the last incomplete batch if the dataset size is
        not divisible by the batch size. If ``False`` and the size of the dataset
        is not divisible by the batch size, the last batch will be smaller.
        Default: ``False``
    timeout: Numeric, optional
        If positive, the timeout value for collecting a batch from workers.
        Should always be non-negative. Default: ``0``
    worker_init_fn: Callable, optional
        If not ``None``, this will be called on each worker subprocess with the
        worker id (an int in ``[0, num_workers - 1]``) as input, after seeding
        and before data loading. Default: ``None``

    """

    def __init__(
        self,
        dataset,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
        timeout=0,
        worker_init_fn=None,
        multiprocessing_context=None,
        generator=None,
        sampler_seed=0,
    ):
        super().__init__(
            dataset,
            batch_size,
            # shuffling is delegated to the distributed sampler below
            False,
            torch.utils.data.distributed.DistributedSampler(
                dataset,
                num_replicas=hvd.size(),
                rank=hvd.rank(),
                shuffle=shuffle,
                seed=sampler_seed,
            ),
            # batch_sampler cannot be used together with a custom sampler
            None,
            num_workers,
            collate_fn,
            pin_memory,
            drop_last,
            timeout,
            worker_init_fn,
            multiprocessing_context,
            generator,
        )
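
# Usage sketch (hedged): shard a dataset across Horovod workers. Assumes horovod
# is already initialized (e.g. by `tt.accelerators.Horovod`) so that `hvd.size()`
# and `hvd.rank()` return meaningful values; `dataset` is any map-style
# `torch.utils.data.Dataset`.
#
#     dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4)
#     for images, labels in dataloader:
#         ...  # each process iterates only over its own shard every epoch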

class AllReduce(Operation):
    """Perform reduction of the input tensor over all the processes.

    If `data` requires gradient you can backpropagate through this operation.

    Parameters
    ----------
    reduction: str, optional
        The reduction operation to use when combining gradients across different
        processes. Can be one of: ["mean", "sum"] being respectively:
        [hvd.mpi_ops.Average, hvd.mpi_ops.Sum]. Default: "mean"
    compression: str, optional
        Compression algorithm used during allreduce to reduce the amount of data
        sent during each parameter update step. Can be one of "none" or "fp16".
        Default: "none"
    name: str, optional
        Name of the reduction operator. If not provided it will be generated
        automatically. Default: `None` (automatic generation)

    Returns
    -------
    torch.Tensor
        Tensor with the same shape as `data` averaged (`reduction="mean"`) or
        summed (`reduction="sum"`) across all processes.

    """

    def __init__(self, reduction: str = "mean", compression="none", name=None):
        self.name = name
        self.reduction = _reduction(reduction)
        self.compression = _compression(compression)

    def forward(self, data):
        """
        Arguments
        ---------
        data: torch.Tensor
            Tensor to be reduced

        """
        return hvd.allreduce(
            data, name=self.name, compression=self.compression, op=self.reduction
        )
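
# Usage sketch (hedged, `local_metric` is illustrative): average a per-process
# scalar tensor (e.g. a loss) so every worker sees the global value.
#
#     reduce = AllReduce(reduction="mean")
#     global_metric = reduce(local_metric)  # identical on all processes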

class AsyncAllReduce(Operation):
    """Perform asynchronous reduction of the input tensor over all the processes.

    Users should pipe this object into `tt.accelerators.horovod.Synchronize()`
    in order to get the resulting value.

    Parameters
    ----------
    reduction: str, optional
        The reduction operation to use when combining gradients across different
        processes. Can be one of: ["mean", "sum"] being respectively:
        [hvd.mpi_ops.Average, hvd.mpi_ops.Sum]. Default: "mean"
    compression: str, optional
        Compression algorithm used during allreduce to reduce the amount of data
        sent during each parameter update step. Can be one of "none" or "fp16".
        Default: "none"
    name: str, optional
        Name of the reduction operator. If not provided it will be generated
        automatically. Default: `None` (automatic generation)

    Returns
    -------
    Handle
        Handle to be used with `tt.accelerators.horovod.Synchronize()`

    """

    def __init__(self, reduction: str = "mean", compression="none", name=None):
        self.name = name
        self.reduction = _reduction(reduction)
        # NOTE: `compression` is accepted for API symmetry with `AllReduce` but is
        # not applied by the asynchronous `hvd.allreduce_async` call below.

    def forward(self, data):
        """
        Arguments
        ---------
        data: torch.Tensor
            Tensor to be reduced across all processes.

        """
        return hvd.allreduce_async(data, name=self.name, op=self.reduction)
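
# Usage sketch (hedged, `local_loss` is illustrative): kick off the reduction,
# overlap it with other work, then block via `Synchronize` (defined below).
#
#     async_reduce = AsyncAllReduce(reduction="mean")
#     handle = async_reduce(local_loss)
#     ...  # other computation runs while the allreduce proceeds in the background
#     global_loss = Synchronize()(handle)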

class AllGather(Operation):
    """Concatenate input tensors from all processes.

    The tensor after concatenation will be available to all processes.
    Concatenation is done over the `0`th dimension, so it is the only dimension
    in which the `torch.Tensor` may differ between processes.

    If `data` requires gradient you can backpropagate through this operation.

    Parameters
    ----------
    name: str, optional
        Name of the reduction operator. If not provided it will be generated
        automatically. Default: `None` (automatic generation)

    Returns
    -------
    torch.Tensor
        Tensor with the same shape as `data` except the `0`th dimension
        (which will be larger, as it is the concatenation of data from all
        processes).

    """

    def __init__(self, name: str = None):
        self.name = name

    def forward(self, data):
        """
        Arguments
        ---------
        data: torch.Tensor
            Tensor to be gathered across all processes.

        """
        return hvd.allgather(data, name=self.name)
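
# Usage sketch (hedged, `local_predictions` is illustrative): collect per-process
# prediction tensors (possibly of different length along dimension 0) on every
# process, e.g. before computing a global metric.
#
#     gather = AllGather()
#     all_predictions = gather(local_predictions)  # dim 0 is the concatenation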

class AsyncAllGather(Operation):
    """Asynchronously concatenate input tensors from all processes.

    The tensor after concatenation will be available to all processes.
    Concatenation is done over the `0`th dimension, so it is the only dimension
    in which the `torch.Tensor` may differ between processes.

    Parameters
    ----------
    name: str, optional
        Name of the reduction operator. If not provided it will be generated
        automatically. Default: `None` (automatic generation)

    Returns
    -------
    Handle
        Handle to be used with `tt.accelerators.horovod.Synchronize()`

    """

    def __init__(self, name=None):
        self.name = name

    def forward(self, data):
        """
        Arguments
        ---------
        data: torch.Tensor
            Tensor to be gathered across all processes.

        """
        return hvd.allgather_async(data, name=self.name)

class Broadcast(Operation):
    """Broadcast tensor from the `rank` process to all other processes.

    If `data` requires gradient you can backpropagate through this operation.

    Parameters
    ----------
    rank: int, optional
        Rank of the process from which `data` will be distributed to other
        processes. Default: `0`
    name: str, optional
        Name of the reduction operator. If not provided it will be generated
        automatically. Default: `None` (automatic generation)

    Returns
    -------
    torch.Tensor
        Tensor with the same shape as `data` with broadcasted values.

    """

    def __init__(self, rank: int = 0, name=None):
        self.rank = rank
        self.name = name

    def forward(self, data):
        """
        Arguments
        ---------
        data: torch.Tensor
            Tensor to be broadcasted across all processes.

        """
        return hvd.broadcast(data, self.rank, name=self.name)
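
# Usage sketch (hedged, `threshold` is illustrative): make all processes use the
# tensor computed on the main process, e.g. a value derived from validation.
#
#     broadcast = Broadcast(rank=0)
#     threshold = broadcast(threshold)  # identical on every process afterwards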

class AsyncBroadcast(Operation):
    """Asynchronously broadcast tensor from the `rank` process to all other processes.

    Parameters
    ----------
    rank: int, optional
        Rank of the process from which `data` will be distributed to other
        processes. Default: `0`
    name: str, optional
        Name of the reduction operator. If not provided it will be generated
        automatically. Default: `None` (automatic generation)

    Returns
    -------
    Handle
        Handle to be used with `tt.accelerators.horovod.Synchronize()`

    """

    def __init__(self, rank: int = 0, name=None):
        self.rank = rank
        self.name = name

    def forward(self, data):
        """
        Arguments
        ---------
        data: torch.Tensor
            Tensor to be broadcasted across all processes.

        """
        # `horovod.torch` exposes the asynchronous broadcast as `broadcast_async`.
        return hvd.broadcast_async(data, self.rank, name=self.name)

class Synchronize(Operation):
    """Synchronize an asynchronous operation and return its result.

    Takes the handle returned by `AsyncAllReduce`, `AsyncAllGather` or
    `AsyncBroadcast` and blocks until the underlying collective has finished.

    Returns
    -------
    torch.Tensor
        Value of the previous asynchronous operation after synchronization.

    """

    def forward(self, handle):
        """
        Arguments
        ---------
        handle: Handle
            Handle returned by an `AsyncAllReduce`, `AsyncAllGather` or
            `AsyncBroadcast` operation, used to retrieve the resulting
            `torch.Tensor`.

        """
        return hvd.synchronize(handle)
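
# Usage sketch (hedged): asynchronous operations are meant to be piped into
# `Synchronize` with torchtraining's `**` operator, mirroring the module-level
# example; `local_tensor` is illustrative.
#
#     gather_and_wait = AsyncAllGather() ** Synchronize()
#     gathered = gather_and_wait(local_tensor)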

def optimizer(
    optimizer,
    named_parameters,
    reduction: str = "sum",
    compression: str = "none",
    accumulate: int = 1,
    rank: int = 0,
):
    """Create Horovod compatible optimizer.

    State of the optimizer will be broadcasted from the specified `rank`.

    Should be used after the `torchtraining.accelerators.Horovod` object was created.

    Parameters
    ----------
    optimizer: torch.optim.Optimizer
        Instance of an optimizer-like object with an interface aligned with
        `torch.optim.Optimizer`.
    named_parameters: Iterable[Tuple[str, torch.nn.parameter.Parameter]]
        A mapping between parameter names and values. Used for naming of
        allreduce operations. Typically just `model.named_parameters()`.
    reduction: str, optional
        The reduction operation to use when combining gradients across different
        processes. Can be one of: ["mean", "sum"] being respectively:
        [hvd.mpi_ops.Average, hvd.mpi_ops.Sum]. Default: "sum"
    compression: str, optional
        Compression algorithm used during allreduce to reduce the amount of data
        sent during each parameter update step. Can be one of "none" or "fp16".
        Default: "none"
    accumulate: int, optional
        Number of backward passes to accumulate before each allreduce/step
        (maps to `backward_passes_per_step`). Divide your loss by ``accumulate``
        to average gradients from multiple batches. Default: `1` (no accumulation)
    rank: int, optional
        Rank from which the optimizer's state will be broadcasted. Default: `0`

    Returns
    -------
    horovod.torch.DistributedOptimizer
        Instance of the optimizer, distributed across workers.

    """
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters,
        _compression(compression),
        backward_passes_per_step=accumulate,
        op=_reduction(reduction),
    )
    hvd.broadcast_optimizer_state(optimizer, root_rank=rank)
    return optimizer
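
# Usage sketch (hedged): wrap a plain `torch.optim` optimizer after the model was
# prepared with `tt.accelerators.Horovod`. The learning rate and the 4-batch
# gradient accumulation are purely illustrative.
#
#     import torch
#
#     local_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#     distributed_optimizer = optimizer(
#         local_optimizer, model.named_parameters(), reduction="sum", accumulate=4
#     )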

def load(f, rank: int = 0, map_location=None, pickle_module=pickle, **pickle_load_args):
    """Load an object saved with `torch.save` in a single process and distribute it to all other processes.

    Useful when loading a saved `torch.nn.Module` (or another `torch` object like
    an `optimizer`) which was saved on a single machine. It can easily be
    distributed to other processes this way.

    If you wish to `torch.save` on a single process you can create an object
    like this::

        save = tt.accelerators.horovod.OnRank(
            lambda module: torch.save(module, "your_module.pt")
        )
        save(your_module)

    Arguments
    ---------
    f: file-like
        A file-like object (has to implement :meth:`read`, :meth:`readline`,
        :meth:`tell`, and :meth:`seek`), or a string or `os.PathLike` object
        containing a file name.
    rank: int, optional
        Process rank on which the data will be loaded. Default: `0`
    map_location: Callable | `torch.device` | string | dict, optional
        Specifies how to remap storage locations. Default: `None`
    pickle_module: module, optional
        Module used for unpickling metadata and objects (has to match the
        :attr:`pickle_module` used to serialize the file). Default: `pickle`
    **pickle_load_args
        Optional keyword arguments passed over to :func:`pickle_module.load`
        and :func:`pickle_module.Unpickler`, e.g. :attr:`errors=...`.

    Returns
    -------
    torch.Tensor | torch.nn.Module | Any
        Anything you saved with `torch.save` really

    """
    data = None
    if hvd.rank() == rank:
        data = torch.load(f, map_location, pickle_module, **pickle_load_args)
    # Broadcast the loaded (picklable) object from `rank` to every other process.
    # `hvd.broadcast` handles tensors only and would fail on the `None` placeholder,
    # so the object variant is used here.
    return hvd.broadcast_object(data, rank)
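
# Usage sketch (hedged, the "checkpoint.pt" path is illustrative): save a
# checkpoint from the main process only, then load and distribute it to every
# worker.
#
#     save = OnRank(lambda module: torch.save(module.state_dict(), "checkpoint.pt"))
#     save(model)
#
#     state_dict = load("checkpoint.pt", rank=0, map_location="cpu")
#     model.load_state_dict(state_dict)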