
Source code for torchtraining.accelerators

"""Accelerators enabling distributed (multi-GPU/multi-node) training.

Accelerators should be instantiated only once and applied to the top-most
module (in the following order):

    * epoch (if it exists)
    * iteration (if it exists)
    * step

Accelerators are the only objects which can be "piped" into producers, for example::

    tt.accelerators.Horovod(...) ** tt.iterations.Iteration(...)

They should be used in this way (although it is not always necessary);
see the `horovod` module for an example.

"""

import importlib

import torch

from .._base import Accelerator
from ..utils import general as utils

if utils.modules_exist("horovod", "horovod.torch"):

    import horovod.torch as hvd

    class Horovod(Accelerator):
        """Accelerate training using Uber's Horovod framework.

        See `torchtraining.accelerators.horovod` package for more information.

        .. note::

            **IMPORTANT**: This object needs the `horovod` Python package to be
            visible. You can install it with `pip install -U torchtraining[horovod]`.
            You should also export the `CUDA_HOME` variable like this:
            `CUDA_HOME=/opt/cuda pip install -U torchtraining[horovod]`
            (your path may vary).

        Parameters
        ----------
        model: torch.nn.Module
            Module to be broadcasted to all processes.
        rank: int, optional
            Root process rank. Default: `0`
        per_worker_threads: int, optional
            Number of threads which can be utilized by each process.
            Default: `pytorch`'s default
        comm: List, optional
            List specifying ranks for the communicator, relative to the
            `MPI_COMM_WORLD` communicator, OR the MPI communicator to use.
            The given communicator will be duplicated. If `None`, Horovod will
            use the `MPI_COMM_WORLD` communicator. Default: `None`

        """

        def __init__(
            self,
            model,
            rank: int = 0,
            per_worker_threads: int = None,
            comm=None,
        ):
            # Initialize Horovod (optionally on a user-provided communicator).
            hvd.init(comm)
            # Pin each process to its local GPU, if CUDA is available.
            if torch.cuda.is_available():
                torch.cuda.set_device(hvd.local_rank())

            # Optionally limit the number of threads used by each worker process.
            if per_worker_threads is not None:
                if per_worker_threads < 1:
                    raise ValueError("Each worker needs at least one thread to run.")
                torch.set_num_threads(per_worker_threads)

            # Synchronize initial model parameters across workers from the root rank.
            hvd.broadcast_parameters(model.state_dict(), root_rank=rank)

    from . import horovod
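
A minimal usage sketch of the class above, assuming the conventional
`import torchtraining as tt` alias and a toy model; the arguments of
`tt.iterations.Iteration` are outside this module's scope and are left
elided as in the docstring::

    import torch
    import torchtraining as tt

    model = torch.nn.Linear(10, 2)

    # Constructing the accelerator calls hvd.init(), pins this process to its
    # local GPU when CUDA is available and broadcasts the model's parameters
    # from rank 0 to all workers.
    accelerator = tt.accelerators.Horovod(model, rank=0, per_worker_threads=2)

    # The accelerator is then piped into the top-most producer, e.g.:
    # accelerator ** tt.iterations.Iteration(...)

Each worker process would be launched through Horovod's usual entry points,
e.g. `horovodrun -np 4 python train.py`.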