Integrate xdoctest - Rebased (#82797)

This is a new version of #15648 based on the latest master branch.

Unlike the previous PR, where I fixed a lot of the doctests in addition to integrating xdoctest, I'm going to reduce the scope here. I'm simply going to integrate xdoctest and then mark all of the failing tests as "SKIP". This will let xdoctest run on the dashboards and provide some value, while still letting the dashboards pass. I'll leave fixing the doctests themselves to another PR.
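
For context, here is a minimal sketch of the kind of google-style docstring example that xdoctest collects and checks once the integration is enabled (the `scale` function is made up for illustration, not part of this PR):

```python
def scale(x, factor=2):
    """
    Multiply ``x`` by ``factor``.

    Example:
        >>> # with --xdoctest-style=google, the >>> lines under this
        >>> # "Example:" section are collected and run, and anything the
        >>> # statements print is compared against the expected lines below
        >>> scale(3)
        6
        >>> scale(3, factor=10)
        30
    """
    return x * factor
```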

In my initial commit, I do the bare minimum to get something running, with the dashboards still failing. The few tests I've already marked as skip were causing segfaults. Running xdoctest currently results in 293 failed and 201 passed tests. The next commits will disable the failing tests. (Unfortunately, I don't have a tool that will insert the `#xdoctest: +SKIP` directive over every failing test, so I'm going to do this mostly manually.)
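
To make the diff below easier to skim, this is roughly what that manual edit looks like on a failing example. This is a hypothetical docstring sketch, not a snippet from this PR:

```python
def load_pretrained(name):
    """
    Example:
        >>> # the +SKIP directive below makes xdoctest skip the rest of this block
        >>> # xdoctest: +SKIP("fails in CI; fix the example in a follow-up PR")
        >>> model = load_pretrained("resnet18")  # hypothetical helper
        >>> model.eval()
    """
    ...
```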

Fixes https://github.com/pytorch/pytorch/issues/71105

@ezyang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/82797
Approved by: https://github.com/ezyang
joncrall
2022-08-12 02:08:01 +00:00
committed by PyTorch MergeBot
parent ba90c9f229
commit 4618371da5
182 changed files with 830 additions and 386 deletions

View File

@ -164,6 +164,11 @@ pytest-rerunfailures
#Pinned versions:
#test that import:
#xdoctest
#Description: runs doctests in pytest
#Pinned versions:
#test that import:
#PyYAML
#Description: data serialization format
#Pinned versions:

View File

@ -17,6 +17,8 @@ pip install "unittest-xml-reporting<=3.2.0,>=2.0.0" \
pytest \
pytest-xdist \
pytest-rerunfailures
# TODO: enable xdoctest later
# xdoctest
if [ -z "${CI}" ]; then
rm -rf "${WORKSPACE_DIR}"/miniconda3/lib/python3.6/site-packages/torch*

View File

@ -36,7 +36,8 @@ popd
=======
:: Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014
pip install "ninja==1.10.0.post1" future "hypothesis==5.35.1" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest pytest-xdist pytest-rerunfailures
pip install "ninja==1.10.0.post1" future "hypothesis==5.35.1" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest pytest-xdist pytest-rerunfailures
:: # TODO: enable xdoctest later
if errorlevel 1 exit /b
if not errorlevel 0 exit /b

View File

@ -490,6 +490,51 @@ def coverage_post_process(app, exception):
for o in output:
f.write(o)
def process_docstring(app, what_, name, obj, options, lines):
"""
Custom process to transform docstring lines: removes xdoctest directive comments.
Args:
app (sphinx.application.Sphinx): the Sphinx application object
what_ (str):
the type of the object which the docstring belongs to (one of
"module", "class", "exception", "function", "method", "attribute")
name (str): the fully qualified name of the object
obj: the object itself
options: the options given to the directive: an object with
attributes inherited_members, undoc_members, show_inheritance
and noindex that are true if the flag option of same name was
given to the auto directive
lines (List[str]): the lines of the docstring, see above
References:
https://www.sphinx-doc.org/en/1.5.1/_modules/sphinx/ext/autodoc.html
https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html
"""
import re
remove_directives = [
# Remove all xdoctest directives
re.compile(r'\s*>>>\s*#\s*x?doctest:\s*.*'),
re.compile(r'\s*>>>\s*#\s*x?doc:\s*.*'),
]
filtered_lines = [
line for line in lines
if not any(pat.match(line) for pat in remove_directives)
]
# Modify the lines inplace
lines[:] = filtered_lines
# make sure there is a blank line at the end
if lines and lines[-1].strip():
lines.append('')
# Called automatically by Sphinx, making this `conf.py` an "extension".
def setup(app):
# NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value
@ -506,6 +551,7 @@ def setup(app):
add_css(css_file)
app.connect("build-finished", coverage_post_process)
app.connect('autodoc-process-docstring', process_docstring)
# From PyTorch 1.5, we now use autogenerated files to document classes and
# functions. This breaks older references since

View File

@ -5,6 +5,7 @@ dependencies:
- numpy
- pytest
- pytest-cov
- xdoctest
- codecov
- pip
- pyyaml

View File

@ -7,6 +7,11 @@ addopts =
# capture only Python print and C++ py::print, but not C output (low-level Python errors)
--capture=sys
--disable-warnings
# TODO: enable xdoctest later
#--xdoctest
#--xdoctest-style=google
#--xdoctest-global-exec="from torch import nn\nimport torch.nn.functional as F\nimport torch"
#--xdoctest-options=+IGNORE_WHITESPACE
testpaths =
test
junit_logging_reruns = all

test/run_doctests.sh (new executable file, 29 lines)
View File

@ -0,0 +1,29 @@
#!/bin/bash
__doc__="
This script simply runs the torch doctests via the xdoctest runner.
This must be run from the root of the torch repo, as it needs the path to the
torch source code.
"
#xdoctest -m torch --style=google list
# Reference: https://stackoverflow.com/questions/59895/bash-script-dir
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
TORCH_MODPATH=$SCRIPT_DIR/../torch
echo "TORCH_MODPATH = $TORCH_MODPATH"
if [[ ! -d "$TORCH_MODPATH" ]] ; then
echo "Could not find the path to the torch module"
else
# Next version of xdoctest will support environment variables that override these settings
export XDOCTEST_GLOBAL_EXEC="from torch import nn\nimport torch.nn.functional as F\nimport torch"
export XDOCTEST_OPTIONS="+IGNORE_WHITESPACE"
# Note: google style won't catch numpy-style docstrings (a few exist) but it also won't fail
# on things not intended to be doctests.
export XDOCTEST_STYLE="google"
xdoctest "$TORCH_MODPATH" --style="$XDOCTEST_STYLE" --global-exec "$XDOCTEST_GLOBAL_EXEC" --options="$XDOCTEST_OPTIONS"
fi

View File

@ -348,6 +348,20 @@ def get_executable_command(options, allow_pytest, disable_coverage=False):
if options.pytest:
if allow_pytest:
executable += ["-m", "pytest"]
# Enable xdoctest
# TODO: enable xdoctest later
# Many doctests assume the existence of these variables
# xdoctest_global_exec_lines = r'\n'.join([
# 'from torch import nn',
# 'import torch.nn.functional as F',
# 'import torch',
# ])
# executable += [
# "--xdoctest",
# "--xdoctest-style=google",
# f"--xdoctest-global-exec='{xdoctest_global_exec_lines}'",
# "--xdoctest-options=+IGNORE_WHITESPACE"
# ]
else:
print_to_stderr(
"Pytest cannot be used for this test. Falling back to unittest."

View File

@ -318,6 +318,7 @@ def set_default_tensor_type(t):
Example::
>>> # xdoctest: +SKIP("Other tests may have changed the default type. Can we reset it?")
>>> torch.tensor([1.2, 3]).dtype # initial default for floating point is torch.float32
torch.float32
>>> torch.set_default_tensor_type(torch.DoubleTensor)
@ -354,6 +355,7 @@ def set_default_dtype(d):
Either torch.float32 or torch.float64.
Example:
>>> # xdoctest: +SKIP("Other tests may have changed the default type. Can we reset it?")
>>> # initial default for floating point is torch.float32
>>> # Python floats are interpreted as float32
>>> torch.tensor([1.2, 3]).dtype
@ -493,6 +495,7 @@ def use_deterministic_algorithms(mode, *, warn_only=False):
>>> torch.use_deterministic_algorithms(True)
# Forward mode nondeterministic error
>>> # xdoctest: +SKIP
>>> torch.randn(10, device='cuda').kthvalue(0)
...
RuntimeError: kthvalue CUDA does not have a deterministic implementation...

View File

@ -128,6 +128,7 @@ def update_names(tensor, names, rename_map, inplace):
>>> x.rename('batch', '...', 'width').names
('batch', 'C', 'H', 'width')
```
tensor.rename(**rename_map) returns a view on tensor that has rename dims
@ -138,6 +139,7 @@ def update_names(tensor, names, rename_map, inplace):
>>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W'))
>>> x.rename(W='width', H='height').names
('N', 'C', 'height', 'width')
```
Finally, tensor.rename has an in-place version called tensor.rename_.

View File

@ -103,6 +103,7 @@ class TorchRefsMode(torch.overrides.TorchFunctionMode):
Switches the interpretation of torch.* functions and Tensor methods to
use PrimTorch refs in torch._refs. (Direct calls to _refs are unaffected.)
>>> # xdoctest: +SKIP
>>> with TorchRefsMode():
... torch.add(x, y) # calls torch._refs.add(x, y)

View File

@ -1197,7 +1197,7 @@ class Tensor(torch._C._TensorBase):
>>> renamed_imgs = imgs.rename(None)
>>> renamed_imgs.names
(None,)
(None, None, None, None)
>>> renamed_imgs = imgs.rename('batch', 'channel', 'height', 'width')
>>> renamed_imgs.names

View File

@ -135,6 +135,7 @@ def fuse_modules(model, modules_to_fuse, inplace=False, fuser_func=fuse_known_mo
Examples::
>>> # xdoctest: +SKIP
>>> m = M().eval()
>>> # m is a module containing the sub-modules below
>>> modules_to_fuse = [ ['conv1', 'bn1', 'relu1'], ['submodule.conv', 'submodule.relu']]

View File

@ -21,6 +21,7 @@ def fuse_conv_bn(is_qat, conv, bn):
>>> m1 = nn.Conv2d(10, 20, 3)
>>> b1 = nn.BatchNorm2d(20)
>>> # xdoctest: +SKIP
>>> m2 = fuse_conv_bn(m1, b1)
"""
assert(conv.training == bn.training),\
@ -58,6 +59,7 @@ def fuse_conv_bn_relu(is_qat, conv, bn, relu):
>>> m1 = nn.Conv2d(10, 20, 3)
>>> b1 = nn.BatchNorm2d(20)
>>> r1 = nn.ReLU(inplace=False)
>>> # xdoctest: +SKIP
>>> m2 = fuse_conv_bn_relu(m1, b1, r1)
"""
assert(conv.training == bn.training == relu.training),\
@ -103,6 +105,7 @@ def fuse_linear_bn(is_qat, linear, bn):
>>> m1 = nn.Linear(20, 10)
>>> b1 = nn.BatchNorm1d(10)
>>> # xdoctest: +SKIP
>>> m2 = fuse_linear_bn(m1, b1)
"""
assert(linear.training == bn.training),\
@ -130,6 +133,7 @@ def fuse_convtranspose_bn(is_qat, convt, bn):
>>> m1 = nn.ConvTranspose2d(10, 20, 3)
>>> b1 = nn.BatchNorm2d(20)
>>> # xdoctest: +SKIP
>>> m2 = fuse_convtranspose_bn(m1, b1)
"""
assert(convt.training == bn.training),\

View File

@ -74,6 +74,7 @@ class ModelReport:
8.) Call model_report.generate_qconfigs to generate the qconfigs based on the report suggestions
Example (with QuantizationTracer):
>>> # xdoctest: +SKIP
>>> # get the necessary qconfig
>>> config = PrepareCustomConfig()
>>> skipped_module_names, skipped_module_classes = get_skipped_module_name_and_classes(config, False)

View File

@ -321,10 +321,11 @@ class ModelReportVisualizer:
The rest of the rows will contain data
Example Use:
>>> # xdoctest: +SKIP("undefined variables")
>>> mod_report_visualizer.generate_filtered_tables(
feature_filter = "per_channel_min",
module_fqn_filter = "block1"
) # generates table with per_channel_min info for all modules in block 1 of the model
... feature_filter = "per_channel_min",
... module_fqn_filter = "block1"
... ) # generates table with per_channel_min info for all modules in block 1 of the model
"""
# first get the filtered data
filtered_data: OrderedDict[str, Any] = self._get_filtered_data(feature_filter, module_fqn_filter)
@ -403,12 +404,13 @@ class ModelReportVisualizer:
Default = "", results in all the modules in the reports to be visible in the table
Example Use:
>>> # xdoctest: +SKIP("undefined variables")
>>> mod_report_visualizer.generate_table_visualization(
feature_filter = "per_channel_min",
module_fqn_filter = "block1"
)
# prints out neatly formatted table with per_channel_min info for
all modules in block 1 of the model
... feature_filter = "per_channel_min",
... module_fqn_filter = "block1"
... )
>>> # prints out neatly formatted table with per_channel_min info
>>> # for all modules in block 1 of the model
"""
# see if we got tabulate
if not got_tabulate:
@ -552,13 +554,14 @@ class ModelReportVisualizer:
Default = "", results in all the modules in the reports to be visible in the table
Example Use:
>>> # xdoctest: +SKIP("undefined variables")
>>> mod_report_visualizer.generate_plot_visualization(
feature_filter = "per_channel_min",
module_fqn_filter = "block1"
)
# outputs line plot of per_channel_min information for all modules in block1 of model
each channel gets it's own line, and it's plotted across the in-order modules
on the x-axis
... feature_filter = "per_channel_min",
... module_fqn_filter = "block1"
... )
>>> # outputs a line plot of per_channel_min information for all
>>> # modules in block1 of the model; each channel gets its own line,
>>> # and it's plotted across the in-order modules on the x-axis
"""
# checks if we have matplotlib and let's user know to install it if don't
if not got_matplotlib:
@ -613,10 +616,11 @@ class ModelReportVisualizer:
Default = 10, the values will be split into 10 equal sized bins
Example Use:
>>> # xdoctest: +SKIP
>>> mod_report_visualizer.generate_histogram_visualization(
feature_filter = "per_channel_min",
module_fqn_filter = "block1"
)
... feature_filter = "per_channel_min",
... module_fqn_filter = "block1"
... )
# outputs histogram of per_channel_min information for all modules in block1 of model
information is gathered across all channels for all modules in block 1 for the
per_channel_min and is displayed in a histogram of equally sized bins

View File

@ -83,6 +83,7 @@ def _with_args(cls_or_self, **kwargs):
Example::
>>> # xdoctest: +SKIP("Undefined vars")
>>> Foo.with_args = classmethod(_with_args)
>>> foo_builder = Foo.with_args(a=3, b=4).with_args(answer=42)
>>> foo_instance1 = foo_builder()
@ -103,11 +104,12 @@ def _with_callable_args(cls_or_self, **kwargs):
Example::
>>> # xdoctest: +SKIP("Undefined vars")
>>> Foo.with_callable_args = classmethod(_with_callable_args)
>>> Foo.with_args = classmethod(_with_args)
>>> foo_builder = Foo.with_callable_args(cur_time=get_time_func).with_args(name="dan")
>>> foo_instance1 = foo_builder()
>>> wait 50
>>> # wait 50
>>> foo_instance2 = foo_builder()
>>> id(foo_instance1.creation_time) == id(foo_instance2.creation_time)
False

View File

@ -30,32 +30,33 @@ class ActivationSparsifier:
specifies how inputs should be aggregated over time.
The aggregate_fn should usually take 2 torch tensors and return the aggregated tensor.
Example
>>> def add_agg_fn(tensor1, tensor2): return tensor1 + tensor2
reduce_fn (Optional, Callable):
default reduce_fn that is used if not specified while registering the layer.
reduce_fn will be called on the aggregated tensor i.e. the tensor obtained after
calling agg_fn() on all inputs.
Example
>>> def mean_reduce_fn(agg_tensor): return agg_tensor.mean(dim=0)
mask_fn (Optional, Callable):
default mask_fn that is used to create the sparsification mask using the tensor obtained after
calling the reduce_fn(). This is used by default if a custom one is passed in the
register_layer().
Note that the mask_fn() definition should contain the sparse arguments that is passed in sparse_config
arguments.
features (Optional, list):
default selected features to sparsify.
If this is non-empty, then the mask_fn will be applied for each feature of the input.
For example,
>>> mask = [mask_fn(reduce_fn(aggregated_fn(input[feature])) for feature in features]
feature_dim (Optional, int):
default dimension of input features. Again, features along this dim will be chosen
for sparsification.
sparse_config (Dict):
Default configuration for the mask_fn. This config will be passed
with the mask_fn()
def add_agg_fn(tensor1, tensor2): return tensor1 + tensor2
reduce_fn (Optional, Callable):
default reduce_fn that is used if not specified while registering the layer.
reduce_fn will be called on the aggregated tensor i.e. the tensor obtained after
calling agg_fn() on all inputs.
Example
def mean_reduce_fn(agg_tensor): return agg_tensor.mean(dim=0)
mask_fn (Optional, Callable):
default mask_fn that is used to create the sparsification mask using the tensor obtained after
calling the reduce_fn(). This is used by default if a custom one is passed in the
register_layer().
Note that the mask_fn() definition should contain the sparse arguments that is passed in sparse_config
arguments.
features (Optional, list):
default selected features to sparsify.
If this is non-empty, then the mask_fn will be applied for each feature of the input.
For example,
mask = [mask_fn(reduce_fn(aggregated_fn(input[feature])) for feature in features]
feature_dim (Optional, int):
default dimension of input features. Again, features along this dim will be chosen
for sparsification.
sparse_config (Dict):
Default configuration for the mask_fn. This config will be passed
with the mask_fn()
Example:
>>> # xdoctest: +SKIP
>>> model = SomeModel()
>>> act_sparsifier = ActivationSparsifier(...) # init activation sparsifier
>>> # Initialize aggregate_fn
@ -74,6 +75,7 @@ class ActivationSparsifier:
>>> act_sparsifier.register_layer(model.some_layer, aggregate_fn=agg_fn, reduce_fn=reduce_fn, mask_fn=mask_fn)
>>>
>>> # start training process
>>> for _ in [...]:
>>> # epoch starts
>>> # model.forward(), compute_loss() and model.backwards()
>>> # epoch ends

View File

@ -89,11 +89,11 @@ class BaseDataScheduler(object):
is called.
Example:
>>> def get_schedule_param(self):
new_param = {}
for name in self.sparsifier.data_groups.keys():
new_param[name] = self.sparsifier.data_groups[name][self.schedule_param] * 0.5
return new_param
>>> def get_schedule_param(self):
... new_param = {}
... for name in self.sparsifier.data_groups.keys():
... new_param[name] = self.sparsifier.data_groups[name][self.schedule_param] * 0.5
... return new_param
When the step() function is called, the value in self.sparsifier.data_groups[name][self.schedule_param]
would be halved

View File

@ -51,7 +51,7 @@ class BaseDataSparsifier(base_sparsifier.BaseSparsifier):
configuration. Only the keys that don't exist in the `config` will
be updated.
Example::
>>> # xdoctest: +SKIP
>>> data_list = [('tensor_1', torch.randn(3,3)), ('tensor_2', torch.randn(4,4))]
>>> defaults = {'sparsity_level': 0.7}
>>> sparsifier = DerivedDataSparsifier(data_list = data_list, **defaults) # Some sparsifier that inherits BaseDataSparsifier

View File

@ -19,6 +19,7 @@ class LambdaSL(BaseScheduler):
>>> # Assuming sparsifier has two groups.
>>> lambda1 = lambda epoch: epoch // 30
>>> lambda2 = lambda epoch: 0.95 ** epoch
>>> # xdoctest: +SKIP
>>> scheduler = LambdaSL(sparsifier, sl_lambda=[lambda1, lambda2])
>>> for epoch in range(100):
>>> train(...)

View File

@ -43,7 +43,8 @@ class BaseSparsifier(abc.ABC):
Example::
>>> config = [{'tensor_fqn': 'layer1.weight', {'tensor_fqn': 'linear2.weight2', 'sparsity_level': 0.5}]
>>> # xdoctest: +SKIP("Can't instantiate abstract class BaseSparsifier with abstract method update_mask")
>>> config = [{'tensor_fqn': 'layer1.weight', 'tensor_fqn': 'linear2.weight2', 'sparsity_level': 0.5}]
>>> defaults = {'sparsity_level': 0.7}
>>> # model.layer1.weight will have `sparsity_level` = 0.7 (getting default)
>>> sparsifier = BaseSparsifier(config, defaults)
@ -233,6 +234,7 @@ class BaseSparsifier(abc.ABC):
to save in the `sparse_params`
Examples:
>>> # xdoctest: +SKIP("locals are undefined")
>>> # Don't save any sparse params
>>> sparsifier.squash_mask()
>>> hasattr(model.submodule1, 'sparse_params')

View File

@ -56,6 +56,7 @@ def make_dual(tensor, tangent, *, level=None):
Example::
>>> # xdoctest: +SKIP("Undefined variables")
>>> with dual_level():
... inp = make_dual(x, v)
... out = f(inp)
@ -95,6 +96,7 @@ def unpack_dual(tensor, *, level=None):
Example::
>>> # xdoctest: +SKIP("Undefined variables")
>>> with dual_level():
... inp = make_dual(x, x_t)
... out = f(inp)
@ -130,6 +132,7 @@ class dual_level(_DecoratorContextManager):
Example::
>>> # xdoctest: +SKIP("Undefined variables")
>>> x = torch.tensor([1])
>>> x_t = torch.tensor([1])
>>> with dual_level():

View File

@ -83,6 +83,7 @@ class FunctionCtx(object):
See :ref:`extending-autograd` for more details on how to use this method.
Example::
>>> # xdoctest: +SKIP
>>> class Func(torch.autograd.Function):
>>> @staticmethod
>>> def forward(ctx, x: torch.Tensor, y: torch.Tensor, z: int):
@ -149,6 +150,7 @@ class FunctionCtx(object):
>>> b = a * a
>>> Inplace.apply(a) # This would lead to wrong gradients!
>>> # but the engine would not know unless we mark_dirty
>>> # xdoctest: +SKIP
>>> b.backward() # RuntimeError: one of the variables needed for gradient
>>> # computation has been modified by an inplace operation
@ -314,6 +316,7 @@ class Function(with_metaclass(FunctionMeta, _C._FunctionBase, FunctionCtx, _Hook
>>> return grad_output * result
>>>
>>> # Use it by calling the apply method:
>>> # xdoctest: +SKIP
>>> output = Exp.apply(input)
"""
def __init__(self, *args, **kwargs):

View File

@ -240,6 +240,7 @@ def vjp(func, inputs, v=None, create_graph=False, strict=False):
... return x.exp().sum(dim=1)
>>> inputs = torch.rand(4, 4)
>>> v = torch.ones(4)
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> vjp(exp_reducer, inputs, v)
(tensor([5.7817, 7.2458, 5.7830, 6.7782]),
tensor([[1.4458, 1.3962, 1.3042, 1.6354],
@ -336,6 +337,7 @@ def jvp(func, inputs, v=None, create_graph=False, strict=False):
... return x.exp().sum(dim=1)
>>> inputs = torch.rand(4, 4)
>>> v = torch.ones(4, 4)
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> jvp(exp_reducer, inputs, v)
(tensor([6.3090, 4.6742, 7.9114, 8.2106]),
tensor([6.3090, 4.6742, 7.9114, 8.2106]))
@ -535,6 +537,7 @@ def jacobian(func, inputs, create_graph=False, strict=False, vectorize=False, st
>>> def exp_reducer(x):
... return x.exp().sum(dim=1)
>>> inputs = torch.rand(2, 2)
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> jacobian(exp_reducer, inputs)
tensor([[[1.4917, 2.4352],
[0.0000, 0.0000]],
@ -744,6 +747,7 @@ def hessian(func, inputs, create_graph=False, strict=False, vectorize=False, out
>>> def pow_reducer(x):
... return x.pow(3).sum()
>>> inputs = torch.rand(2, 2)
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> hessian(pow_reducer, inputs)
tensor([[[[5.2265, 0.0000],
[0.0000, 0.0000]],
@ -847,6 +851,7 @@ def vhp(func, inputs, v=None, create_graph=False, strict=False):
... return x.pow(3).sum()
>>> inputs = torch.rand(2, 2)
>>> v = torch.ones(2, 2)
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> vhp(pow_reducer, inputs, v)
(tensor(0.5591),
tensor([[1.0689, 1.2431],
@ -936,6 +941,7 @@ def hvp(func, inputs, v=None, create_graph=False, strict=False):
... return x.pow(3).sum()
>>> inputs = torch.rand(2, 2)
>>> v = torch.ones(2, 2)
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> hvp(pow_reducer, inputs, v)
(tensor(0.1448),
tensor([[2.0239, 1.6456],

View File

@ -110,7 +110,7 @@ class no_grad(_DecoratorContextManager):
your dual tensors.
Example::
>>> # xdoctest: +SKIP
>>> x = torch.tensor([1.], requires_grad=True)
>>> with torch.no_grad():
... y = x * 2
@ -156,7 +156,7 @@ class enable_grad(_DecoratorContextManager):
This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
Example::
>>> # xdoctest: +SKIP
>>> x = torch.tensor([1.], requires_grad=True)
>>> with torch.no_grad():
... with torch.enable_grad():
@ -165,6 +165,7 @@ class enable_grad(_DecoratorContextManager):
True
>>> y.backward()
>>> x.grad
tensor([2.])
>>> @torch.enable_grad()
... def doubler(x):
... return x * 2
@ -205,18 +206,18 @@ class set_grad_enabled(_DecoratorContextManager):
This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
Example::
>>> # xdoctest: +SKIP
>>> x = torch.tensor([1.], requires_grad=True)
>>> is_train = False
>>> with torch.set_grad_enabled(is_train):
... y = x * 2
>>> y.requires_grad
False
>>> torch.set_grad_enabled(True)
>>> _ = torch.set_grad_enabled(True)
>>> y = x * 2
>>> y.requires_grad
True
>>> torch.set_grad_enabled(False)
>>> _ = torch.set_grad_enabled(False)
>>> y = x * 2
>>> y.requires_grad
False
@ -268,6 +269,7 @@ class inference_mode(_DecoratorContextManager):
... y = x * x
>>> y.requires_grad
False
>>> # xdoctest: +SKIP("want string isnt quite right")
>>> y._version
Traceback (most recent call last):
File "<stdin>", line 1, in <module>

View File

@ -47,11 +47,11 @@ class saved_tensors_hooks():
>>> b = torch.ones(5, requires_grad=True) * 2
>>> with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
... y = a * b
Packing tensor([1., 1., 1., 1., 1.])
Packing tensor([2., 2., 2., 2., 2.])
Packing tensor([1., 1., 1., 1., 1.], requires_grad=True)
Packing tensor([2., 2., 2., 2., 2.], grad_fn=<MulBackward0>)
>>> y.sum().backward()
Unpacking tensor([1., 1., 1., 1., 1.])
Unpacking tensor([2., 2., 2., 2., 2.])
Unpacking tensor([1., 1., 1., 1., 1.], requires_grad=True)
Unpacking tensor([2., 2., 2., 2., 2.], grad_fn=<MulBackward0>)
.. warning ::
Performing an inplace operation on the input to either hooks may lead
@ -93,6 +93,7 @@ class save_on_cpu(saved_tensors_hooks):
Example::
>>> # xdoctest: +REQUIRES(env:CUDAHOME)
>>> a = torch.randn(5, requires_grad=True, device="cuda")
>>> b = torch.randn(5, requires_grad=True, device="cuda")
>>> c = torch.randn(5, requires_grad=True, device="cuda")

View File

@ -118,11 +118,12 @@ class profile(object):
please use ``use_cuda = False`` or ``num_workers = 0``.
Example:
>>> # xdoctest: +SKIP
>>> x = torch.randn((1, 1), requires_grad=True)
>>> with torch.autograd.profiler.profile() as prof:
>>> for _ in range(100): # any normal python code, really!
>>> y = x ** 2
>> y.backward()
>>> y.backward()
>>> # NOTE: some columns were removed for brevity
>>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
----------------------------------- --------------- --------------- ---------------
@ -443,6 +444,7 @@ class record_function(ContextDecorator):
... z = y ** 3
... y.backward()
...
>>> # xdoctest: +IGNORE_WANT
>>> # NOTE: some columns were removed for brevity
>>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
----------------------------------- --------------- --------------- ---------------
@ -535,6 +537,7 @@ class emit_itt(object):
Default: ``False``
Example:
>>> # xdoctest: +SKIP("Undefined variables")
>>> with torch.autograd.profiler.emit_itt():
... model(x)
@ -602,6 +605,7 @@ class emit_nvtx(object):
Default: ``False``
Example:
>>> # xdoctest: +SKIP("undefined variables")
>>> with torch.cuda.profiler.profile():
... model(x) # Warmup CUDA memory allocator and profiler
... with torch.autograd.profiler.emit_nvtx():

View File

@ -35,6 +35,7 @@ For example, you can the torch.linalg.inv function will raise torch.linalg.LinAl
a matrix is not invertible.\n \
\n\
Example:\n \
>>> # xdoctest: +REQUIRES(--lapac)\n \
>>> matrix = torch.eye(3, 3)\n \
>>> matrix[-1, -1] = 0\n \
>>> matrix\n \

View File

@ -126,6 +126,7 @@ def load_state_dict(
None.
Examples
>>> # xdoctest: +SKIP
>>> my_model = MyModule()
>>> optimizer = Adagrad(my_model.parameters())
>>> model_state_dict = my_model.state_dict()

View File

@ -128,6 +128,7 @@ def save_state_dict(
no_dist (bool): Don't attempt to save in SPMD style. Default to False
Example:
>>> # xdoctest: +SKIP
>>> my_model = MyModule()
>>> # We must call this function prior to state_dict()
>>> my_model._register_state_dict_hook(state_dict_hook)

View File

@ -60,6 +60,7 @@ class _PartialTensor(torch.Tensor):
Examples:
>>> # All tensors below are of torch.int64 type.
>>> # We have 2 process groups, 2 ranks.
>>> # xdoctest: +SKIP
>>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
>>> tensor = torch.cat([tensor, tensor + 2])
>>> tensor

View File

@ -30,6 +30,7 @@ def named_params_with_sharded_tensor(
Example::
>>> # xdoctest: +SKIP
>>> model = torch.nn.Linear(*linear_size)
>>> shard_parameter(model, "weight", spec)
>>> for name, param in named_params_with_sharded_tensor(model):

View File

@ -363,22 +363,24 @@ def init_from_local_shards(
Examples:
Suppose we want construct a sharded tensor on two ranks, global size = (10, 5),
each shard have a (5, 5) local tensor, we can do it like below:
Suppose we want construct a sharded tensor on two ranks, global size = (10, 5),
each shard have a (5, 5) local tensor, we can do it like below:
on rank 0:
on rank 0:
>>> # xdoctest: +SKIP("not distributed")
>>> local_shard_metadata = ShardMetadata(
>>> shard_offsets=[0, 0]
>>> shard_lengths=[5, 5]
>>> shard_offsets=[0, 0],
>>> shard_lengths=[5, 5],
>>> placement="rank:0/cuda:0"
>>> )
>>> local_shards = [Shard(torch.randn(5, 5), local_shard_metadata)]
>>> sharded_tensor = init_from_local_shards(local_shards, [10, 5])
on rank 1:
on rank 1:
>>> # xdoctest: +SKIP("not distributed")
>>> local_shard_metadata = ShardMetadata(
>>> shard_offsets=[5, 0]
>>> shard_lengths=[5, 5]
>>> shard_offsets=[5, 0],
>>> shard_lengths=[5, 5],
>>> placement="rank:1/cuda:1"
>>> )
>>> local_shards = [Shard(torch.randn(5, 5), local_shard_metadata)]
@ -427,8 +429,8 @@ def custom_sharded_op_impl(func):
Example::
>>> @custom_sharded_op_impl(torch.nn.functional.linear)
>>> def my_custom_sharded_linear(types, args, kwargs, process_group):
>>> ....
>>>
>>> ...
>>> # xdoctest: +SKIP("Undefined variables")
>>> input = torch.rand(10, 32)
>>> weight = sharded_tensor.rand(32, 16)
>>> bias = torch.rand(16)

View File

@ -12,11 +12,12 @@ def _sharded_op_common(op, early_stop_func, extra_check):
different behaviors are done on either local shards or a local tensor.
Example::
>>> # xdoctest: +SKIP("Undefined variables")
>>> op = torch.transpose
>>> @_sharded_op_impl(op)
>>> @_sharded_op_common(op, early_stop_func, extra_check)
>>> def sharded_tensor_op(types, args, kwargs, process_group):
>>> ....
>>> ...
>>>
>>> st = sharded_tensor.rand(32, 16)
>>> st.transpose(1, 2)

View File

@ -801,6 +801,7 @@ class ShardedTensor(ShardedTensorBase):
Examples:
>>> # All tensors below are of torch.int64 type.
>>> # We have 2 process groups, 2 ranks.
>>> # xdoctest: +SKIP
>>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
>>> local_tensor = torch.unsqueeze(torch.cat([tensor, tensor + 2]))
>>> local_tensor
@ -949,6 +950,7 @@ class ShardedTensor(ShardedTensorBase):
Examples:
>>> # We have 2 process groups, 2 ranks.
>>> # xdoctest: +SKIP
>>> tensor = torch.arange(4, dtype=torch.int64) + 1 + 2 * rank
>>> tensor = torch.stack([tensor, tensor])
>>> tensor

View File

@ -48,6 +48,7 @@ class ShardingPlan(object):
>>> return self.relu(self.fc2(self.gelu(self.fc1(input))))
>>> # xdoctest: +SKIP("Undefined spec1, spec2)
>>> sharding_plan = ShardingPlan(
>>> plan={
>>> "fc1.weight": spec1,

View File

@ -97,6 +97,7 @@ def register_ddp_comm_hook(
Uses Python comm hook implementations.
Example::
>>> # xdoctest: +SKIP
>>> register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, model, state)
"""
comm_hook_type.value(model=model, state=state)

View File

@ -19,6 +19,7 @@ def noop_hook(_: Any, bucket: GradBucket) -> torch.futures.Future[torch.Tensor]:
some factors such as the overlap between allreduce and computation or the desynchronization across ranks.
Example::
>>> # xdoctest: +SKIP
>>> ddp_model.register_comm_hook(None, noop_hook)
"""
fut: torch.futures.Future[torch.Tensor] = torch.futures.Future()

View File

@ -33,6 +33,7 @@ def allreduce_hook(
unaffecting DDP behavior.
Example::
>>> # xdoctest: +SKIP
>>> ddp_model.register_comm_hook(process_group, allreduce_hook)
"""
return _allreduce_fut(process_group, bucket.buffer())
@ -49,6 +50,7 @@ def fp16_compress_hook(
tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``).
Example::
>>> # xdoctest: +SKIP
>>> ddp_model.register_comm_hook(process_group, fp16_compress_hook)
"""
group_to_use = process_group if process_group is not None else dist.group.WORLD
@ -84,6 +86,7 @@ def bf16_compress_hook(
tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``).
Example::
>>> # xdoctest: +SKIP
>>> ddp_model.register_comm_hook(process_group, bf16_compress_hook)
"""
group_to_use = process_group if process_group is not None else dist.group.WORLD
@ -116,6 +119,7 @@ def fp16_compress_wrapper(
Therefore, ``fp16_compress_hook`` is equivalent to ``fp16_compress_wrapper(allreduce_hook)``.
Example::
>>> # xdoctest: +SKIP
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
>>> ddp_model.register_comm_hook(state, fp16_compress_wrapper(powerSGD_hook))
"""
@ -153,6 +157,7 @@ def bf16_compress_wrapper(
Therefore, ``bf16_compress_hook`` is equivalent to ``bf16_compress_wrapper(allreduce_hook)``.
Example::
>>> # xdoctest: +SKIP
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
>>> ddp_model.register_comm_hook(state, bf16_compress_wrapper(powerSGD_hook))
"""

View File

@ -82,6 +82,7 @@ def post_localSGD_hook(
Future handler of the communication, which updates the gradients in place.
Example::
>>> # xdoctest: +SKIP
>>> state = PostLocalSGDState(process_group=process_group, subgroup=subgroup,
start_localSGD_iter=10)
>>> ddp_model.register_comm_hook(state, post_localSGD_hook)

View File

@ -379,6 +379,7 @@ def powerSGD_hook(
Future handler of the communication, which updates the gradients in place.
Example::
>>> # xdoctest: +SKIP
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1,
start_powerSGD_iter=10, min_compression_rate=0.5)
>>> ddp_model.register_comm_hook(state, powerSGD_hook)
@ -687,6 +688,7 @@ def batched_powerSGD_hook(
Future handler of the communication, which updates the gradients in place.
Example::
>>> # xdoctest: +SKIP
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
>>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
""" # noqa: B950

View File

@ -59,6 +59,7 @@ def quantization_pertensor_hook(
``allreduce`` protocol. It works only with flattened grads.
Example::
>>> # xdoctest: +SKIP
>>> ddp_model.register_comm_hook(process_group, quantization_pertensor_hook)
"""
group_to_use = process_group if process_group is not None else dist.group.WORLD
@ -138,6 +139,7 @@ def quantization_perchannel_hook(
``allreduce`` protocol. It works only with flattened grads.
Example::
>>> # xdoctest: +SKIP
>>> ddp_model.register_comm_hook(process_group, quantization_perchannel_hook)
"""
group_to_use = process_group if process_group is not None else dist.group.WORLD

View File

@ -150,6 +150,7 @@ class Join():
>>> import torch
>>> import torch.distributed as dist
>>> import torch.multiprocessing as mp
>>> # xdoctest: +SKIP
>>> import torch.nn.parallel.DistributedDataParallel as DDP
>>> import torch.distributed.optim.ZeroRedundancyOptimizer as ZeRO
>>> from torch.distributed.algorithms.join import Join

View File

@ -49,35 +49,36 @@ class PeriodicModelAverager(ModelAverager):
Example::
>>> import torch
>>> import torch.distributed as dist
>>> import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
>>> import torch.nn as nn
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> import torch.distributed as dist
>>> import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
>>> import torch.nn as nn
>>>
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
>>> torch.cuda.set_device(rank)
>>> module = nn.Linear(1, 1, bias=False).cuda()
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>> # Register a post-localSGD communication hook.
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
>>> torch.cuda.set_device(rank)
>>> module = nn.Linear(1, 1, bias=False).cuda()
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>> # Register a post-localSGD communication hook.
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>>
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
>>> # After 100 steps, run model averaging every 4 steps.
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> averager = averagers.PeriodicModelAverager(period=4, warmup_steps=100)
>>> for step in range(0, 200):
>>> optimizer.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> optimizer.step()
>>> # Will average model parameters globally every 4 steps. Thus,
>>> # inter-node communication only occurs every 4 iterations after
>>> # the initial ``warmup_steps`` period.
>>> averager.average_parameters(model.parameters())
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
>>> # After 100 steps, run model averaging every 4 steps.
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> averager = averagers.PeriodicModelAverager(period=4, warmup_steps=100)
>>> for step in range(0, 200):
>>> optimizer.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> optimizer.step()
>>> # Will average model parameters globally every 4 steps. Thus,
>>> # inter-node communication only occurs every 4 iterations after
>>> # the initial ``warmup_steps`` period.
>>> averager.average_parameters(model.parameters())
"""
def __init__(

View File

@ -47,43 +47,44 @@ class HierarchicalModelAverager(averagers.ModelAverager):
(default: ``None``)
Example::
>>> from collections import OrderedDict
>>> import torch
>>> import torch.distributed as dist
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
>>> PostLocalSGDState,
>>> post_localSGD_hook,
>>> )
>>> import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD
>>> import torch.nn as nn
>>> # xdoctest: +SKIP('undefined rank')
>>> from collections import OrderedDict
>>> import torch
>>> import torch.distributed as dist
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
>>> PostLocalSGDState,
>>> post_localSGD_hook,
>>> )
>>> import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD
>>> import torch.nn as nn
>>>
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
>>> torch.cuda.set_device(rank)
>>> module = nn.Linear(1, 1, bias=False).to(rank)
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>> # Register a post-localSGD communication hook.
>>> # Assume that each machine has 4 GPUs, then each intra-machine subgroup has a size of 4.
>>> subgroup, _ = dist.new_subgroups()
>>> state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
>>> torch.cuda.set_device(rank)
>>> module = nn.Linear(1, 1, bias=False).to(rank)
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>> # Register a post-localSGD communication hook.
>>> # Assume that each machine has 4 GPUs, then each intra-machine subgroup has a size of 4.
>>> subgroup, _ = dist.new_subgroups()
>>> state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>>
>>> # Average parameters among each group of 8 processes every 4 iterations, and among all
>>> # the 16 processes every 16 iterations.
>>> averager = hierarchicalSGD.HierarchicalModelAverager(
>>> period_group_size_dict=OrderedDict([(4, 8), (16, 16)]), warmup_steps=100)
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
>>> # After 100 steps, run model averaging at two levels.
>>> for step in range(0, 200):
>>> optimizer.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> optimizer.step()
>>> # Average parameters after ``optimizer.step()``.
>>> # Thus, the inter-node communication only occurs periodically after ``warmup_steps``.
>>> averager.average_parameters(model.parameters())
>>> # Average parameters among each group of 8 processes every 4 iterations, and among all
>>> # the 16 processes every 16 iterations.
>>> averager = hierarchicalSGD.HierarchicalModelAverager(
>>> period_group_size_dict=OrderedDict([(4, 8), (16, 16)]), warmup_steps=100)
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
>>> # After 100 steps, run model averaging at two levels.
>>> for step in range(0, 200):
>>> optimizer.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> optimizer.step()
>>> # Average parameters after ``optimizer.step()``.
>>> # Thus, the inter-node communication only occurs periodically after ``warmup_steps``.
>>> averager.average_parameters(model.parameters())
.. warning ::
The last group size in the dict must be the size of the provided ``process_group``,

View File

@ -36,6 +36,7 @@ class context(object):
Example::
>>> import torch.distributed.autograd as dist_autograd
>>> # xdoctest: +SKIP
>>> with dist_autograd.context() as context_id:
>>> t1 = torch.rand((3, 3), requires_grad=True)
>>> t2 = torch.rand((3, 3), requires_grad=True)

View File

@ -1125,6 +1125,7 @@ def batch_isend_irecv(p2p_op_list):
op in the op_list.
Examples:
>>> # xdoctest: +SKIP("no rank")
>>> send_tensor = torch.arange(2) + 2 * rank
>>> recv_tensor = torch.randn(2)
>>> send_op = dist.P2POp(dist.isend, send_tensor, (rank + 1)%world_size)
@ -1338,6 +1339,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
None, if not async_op or if not part of the group
Examples:
>>> # xdoctest: +SKIP("no rank")
>>> # All tensors below are of torch.int64 type.
>>> # We have 2 process groups, 2 ranks.
>>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
@ -1680,6 +1682,7 @@ def all_gather_object(object_list, obj, group=None):
function with data you trust.
Example::
>>> # xdoctest: +SKIP("need process group init")
>>> # Note: Process group initialization omitted on each rank.
>>> import torch.distributed as dist
>>> # Assumes world_size of 3.
@ -1766,16 +1769,17 @@ def gather_object(obj, object_gather_list=None, dst=0, group=None):
function with data you trust.
Example::
>>> # xdoctest: +SKIP("need process group init")
>>> # Note: Process group initialization omitted on each rank.
>>> import torch.distributed as dist
>>> # Assumes world_size of 3.
>>> gather_objects = ["foo", 12, {1: 2}] # any picklable object
>>> output = [None for _ in gather_objects]
>>> dist.gather_object(
gather_objects[dist.get_rank()],
output if dist.get_rank() == 0 else None,
dst=0
)
... gather_objects[dist.get_rank()],
... output if dist.get_rank() == 0 else None,
... dst=0
... )
>>> # On rank 0
>>> output
['foo', 12, {1: 2}]
@ -1871,6 +1875,7 @@ def broadcast_object_list(object_list, src=0, group=None, device=None):
function with data you trust.
Example::
>>> # xdoctest: +SKIP("need process group init")
>>> # Note: Process group initialization omitted on each rank.
>>> import torch.distributed as dist
>>> if dist.get_rank() == 0:
@ -1968,6 +1973,7 @@ def scatter_object_list(
function with data you trust.
Example::
>>> # xdoctest: +SKIP("need process group init")
>>> # Note: Process group initialization omitted on each rank.
>>> import torch.distributed as dist
>>> if dist.get_rank() == 0:
@ -2053,6 +2059,7 @@ def all_gather(tensor_list, tensor, group=None, async_op=False):
None, if not async_op or if not part of the group
Examples:
>>> # xdoctest: +SKIP("need process group init")
>>> # All tensors below are of torch.int64 dtype.
>>> # We have 2 process groups, 2 ranks.
>>> tensor_list = [torch.zeros(2, dtype=torch.int64) for _ in range(2)]
@ -2122,6 +2129,7 @@ def _all_gather_base(output_tensor, input_tensor, group=None, async_op=False):
None, if not async_op or if not part of the group
Examples:
>>> # xdoctest: +SKIP("need process group init")
>>> # All tensors below are of torch.int64 dtype.
>>> # We have 2 process groups, 2 ranks.
>>> output_tensor = torch.zeros(2, dtype=torch.int64)
@ -2563,6 +2571,7 @@ def all_to_all_single(
`all_to_all_single` is experimental and subject to change.
Examples:
>>> # xdoctest: +SKIP("Undefined rank")
>>> input = torch.arange(4) + rank * 4
>>> input
tensor([0, 1, 2, 3]) # Rank 0
@ -2678,6 +2687,7 @@ def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False
`all_to_all` is experimental and subject to change.
Examples:
>>> # xdoctest: +SKIP("Undefined rank")
>>> input = torch.arange(4) + rank * 4
>>> input = list(input.chunk(4))
>>> input
@ -2858,6 +2868,7 @@ def monitored_barrier(group=GroupMember.WORLD, timeout=None, wait_all_ranks=Fals
``None``.
Example::
>>> # xdoctest: +SKIP("need process group init")
>>> # Note: Process group initialization omitted on each rank.
>>> import torch.distributed as dist
>>> if dist.get_rank() != 1:
@ -3115,6 +3126,7 @@ def new_subgroups(
Examples:
>>> # Create intra-machine subgroups.
>>> # xdoctest: +SKIP("need process group init")
>>> cur_subgroup, subgroups = dist.new_subgroups()
>>> # Allreduce within the machine.
>>> rank = dist.get_rank()
@ -3229,6 +3241,7 @@ def new_subgroups_by_enumeration(
Examples:
>>> # Create two subgroups, where each has 2 processes.
>>> # xdoctest: +SKIP("need process group init")
>>> cur_subgroup, subgroups = dist.new_subgroups(ranks=[[0, 2], [1, 3]])
>>> rank = dist.get_rank()
>>> tensor = torch.ones(1, device=rank) * rank

View File

@ -293,8 +293,9 @@ class StateDictType(Enum):
meaningful to FSDP (because parameters are flattened). Note that
these APIs are meant for use via the :func:`state_dict_type`
context manager as follows:
>>> # xdoctest: +SKIP("undefined variables")
>>> with fsdp.state_dict_type(StateDictType.LOCAL_STATE_DICT):
>>> state = fsdp.state_dict() # loads local state dict
... state = fsdp.state_dict() # loads local state dict
3. ``_sharded_state_dict/_load_sharded_state_dict``: this pair of APIs
return and load sharded, unflattened parameters. The ``state_dict``
return by ``sharded_state_dict`` can be used by all other parallel
@ -326,6 +327,7 @@ class FullStateDictConfig(StateDictConfig):
together to optimize memory savings when taking checkpoints. Note that
this config class is meant for user via the :func:`state_dict_type`
context manager as follows:
>>> # xdoctest: +SKIP("undefined variables")
>>> fsdp = FSDP(model, auto_wrap_policy=...)
>>> cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
>>> with FullyShardedDataParallel.state_dict_type(fsdp, StateDictType.FULL_STATE_DICT, cfg):
@ -470,6 +472,7 @@ class FullyShardedDataParallel(nn.Module):
Example::
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> torch.cuda.set_device(device_id)
@ -623,9 +626,11 @@ class FullyShardedDataParallel(nn.Module):
Example::
>>> # xdoctest: +SKIP("undefined variables")
>>> module = MyModule(device="meta")
>>> def my_init_fn(module):
>>> # responsible for initializing a module, such as with reset_parameters
>>> ...
>>> fsdp_model = FSDP(module, param_init_fn=my_init_fn, auto_wrap_policy=size_based_auto_wrap_policy)
>>> print(next(fsdp_model.parameters()).device) # current CUDA device
>>> # With torchdistX
@ -1806,9 +1811,10 @@ class FullyShardedDataParallel(nn.Module):
Example::
>>> model = DDP(FSDP(...))
>>> with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
>>> checkpoint = model.state_dict()
>>> # xdoctest: +SKIP("undefined variables")
>>> model = DDP(FSDP(...))
>>> with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
>>> checkpoint = model.state_dict()
Args:
module (torch.nn.Module): Root module.
@ -2051,22 +2057,23 @@ class FullyShardedDataParallel(nn.Module):
Example::
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT, full_state_dict_config):
>>> full_dict = sharded_module.state_dict()
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> local_dict = sharded_module.state_dict()
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT, full_state_dict_config):
>>> full_dict = sharded_module.state_dict()
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> local_dict = sharded_module.state_dict()
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])
.. warning:: This needs to be called on all ranks, since synchronization
primitives may be used.
@ -2332,24 +2339,25 @@ class FullyShardedDataParallel(nn.Module):
Example::
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> checkpoint = torch.load(PATH)
>>> full_state_dict = checkpoint['full_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT):
>>> sharded_module.load_state_dict(full_state_dict)
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> local_state_dict = checkpoint['local_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> sharded_module.load_state_dict(local_state_dict)
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> checkpoint = torch.load(PATH)
>>> full_state_dict = checkpoint['full_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT):
>>> sharded_module.load_state_dict(full_state_dict)
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> local_state_dict = checkpoint['local_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> sharded_module.load_state_dict(local_state_dict)
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])
.. warning:: This needs to be called on all ranks, since synchronization
primitives may be used.
@ -3841,6 +3849,7 @@ class FullyShardedDataParallel(nn.Module):
Example::
>>> # xdoctest: +SKIP("undefined variables")
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> model, optim = ...
>>> full_osd = FSDP.full_optim_state_dict(model, optim)
@ -3908,6 +3917,7 @@ class FullyShardedDataParallel(nn.Module):
Example::
>>> # xdoctest: +SKIP("undefined variables")
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> model, optim = ...
>>> full_osd = FSDP.full_optim_state_dict(model, optim) # only non-empty on rank 0
@ -4004,6 +4014,7 @@ class FullyShardedDataParallel(nn.Module):
:meth:`full_optim_state_dict`) to use parameter IDs and be loadable to
a non-wrapped model::
>>> # xdoctest: +SKIP("undefined variables")
>>> wrapped_model, wrapped_optim = ...
>>> full_osd = FSDP.full_optim_state_dict(wrapped_model, wrapped_optim)
>>> nonwrapped_model, nonwrapped_optim = ...
@ -4013,6 +4024,7 @@ class FullyShardedDataParallel(nn.Module):
To re-key a normal optimizer state dict from a non-wrapped model to be
loadable to a wrapped model::
>>> # xdoctest: +SKIP("undefined variables")
>>> nonwrapped_model, nonwrapped_optim = ...
>>> osd = nonwrapped_optim.state_dict()
>>> rekeyed_osd = FSDP.rekey_optim_state_dict(osd, OptimStateKeyType.PARAM_NAME, nonwrapped_model)

View File

@ -30,7 +30,7 @@ GPU (nproc_per_node - 1)*.
::
>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
arguments of your training script)
@ -41,7 +41,7 @@ Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*
::
>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
--master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
and all other arguments of your training script)
@ -50,7 +50,7 @@ Node 2:
::
>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
--nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
--master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
and all other arguments of your training script)
@ -59,7 +59,7 @@ Node 2:
::
>>> python -m torch.distributed.launch --help
python -m torch.distributed.launch --help
**Important Notices:**
@ -78,6 +78,7 @@ Parsing the local_rank argument
::
>>> # xdoctest: +SKIP
>>> import argparse
>>> parser = argparse.ArgumentParser()
>>> parser.add_argument("--local_rank", type=int)
@ -95,6 +96,7 @@ or
>>> with torch.cuda.device(args.local_rank):
>>> # your code to run
>>> ...
3. In your training program, you are supposed to call the following function
at the beginning to start the distributed backend. It is strongly recommended
@ -103,8 +105,8 @@ but ``env://`` is the one that is officially supported by this module.
::
torch.distributed.init_process_group(backend='YOUR BACKEND',
init_method='env://')
>>> torch.distributed.init_process_group(backend='YOUR BACKEND',
>>> init_method='env://')
4. In your training program, you can either use regular distributed functions
or use :func:`torch.nn.parallel.DistributedDataParallel` module. If your
@ -114,9 +116,9 @@ here is how to configure it.
::
model = torch.nn.parallel.DistributedDataParallel(model,
device_ids=[args.local_rank],
output_device=args.local_rank)
>>> model = torch.nn.parallel.DistributedDataParallel(model,
>>> device_ids=[args.local_rank],
>>> output_device=args.local_rank)
Please ensure that ``device_ids`` argument is set to be the only GPU device id
that your code will be operating on. This is generally the local rank of the

View File

@ -199,6 +199,7 @@ class _RemoteModule(nn.Module):
>>> from torch import nn, Tensor
>>> from torch.distributed.nn.api.remote_module import RemoteModule
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> remote_linear_module = RemoteModule(
>>> "worker1/cpu", nn.Linear, args=(20, 30),
@ -505,6 +506,7 @@ class _RemoteModule(nn.Module):
>>> from torch import nn, Tensor
>>> from torch.distributed.nn.api.remote_module import RemoteModule
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> remote_module = RemoteModule(
>>> "worker1/cpu", nn.Linear, args=(20, 30),
@ -626,6 +628,7 @@ class RemoteModule(_RemoteModule):
>>> from torch import nn, Tensor
>>> from torch.distributed.nn.api.remote_module import RemoteModule
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> remote_linear_module = RemoteModule(
>>> "worker1/cpu", nn.Linear, args=(20, 30),

View File

@ -134,6 +134,7 @@ def _all_gather_base(output_tensor, input_tensor, group=group.WORLD):
Examples:
>>> # All tensors below are of torch.int64 dtype.
>>> # We have 2 process groups, 2 ranks.
>>> # xdoctest: +SKIP("incorrect want text")
>>> output_tensor = torch.zeros(2, dtype=torch.int64)
>>> output_tensor
[tensor([0, 0])] # Rank 0 and 1

View File

@ -168,6 +168,7 @@ class DistributedOptimizer:
>>> from torch import optim
>>> from torch.distributed.optim import DistributedOptimizer
>>>
>>> # xdoctest: +SKIP
>>> with dist_autograd.context() as context_id:
>>> # Forward pass.
>>> rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))

View File

@ -15,41 +15,42 @@ class PostLocalSGDOptimizer(torch.optim.Optimizer):
Example::
>>> import torch
>>> import torch.distributed as dist
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
>>> import torch.nn as nn
>>> from torch.distributed.optim import PostLocalSGDOptimizer
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
>>> PostLocalSGDState,
>>> post_localSGD_hook,
>>> )
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> import torch.distributed as dist
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
>>> import torch.nn as nn
>>> from torch.distributed.optim import PostLocalSGDOptimizer
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
>>> PostLocalSGDState,
>>> post_localSGD_hook,
>>> )
>>>
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>>
>>> # Register a post-localSGD communication hook.
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>> # Register a post-localSGD communication hook.
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>>
>>> # Create a post-localSGD optimizer that wraps a local optimizer.
>>> # Note that ``warmup_steps`` used in ``PostLocalSGDOptimizer`` must be the same as
>>> # ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
>>> opt = PostLocalSGDOptimizer(
>>> optim=local_optim,
>>> averager=averagers.PeriodicModelAverager(period=4, warmup_steps=100)
>>> )
>>> # Create a post-localSGD optimizer that wraps a local optimizer.
>>> # Note that ``warmup_steps`` used in ``PostLocalSGDOptimizer`` must be the same as
>>> # ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
>>> opt = PostLocalSGDOptimizer(
>>> optim=local_optim,
>>> averager=averagers.PeriodicModelAverager(period=4, warmup_steps=100)
>>> )
>>>
>>> # In the first 100 steps, DDP runs global gradient averaging at every step.
>>> # After 100 steps, DDP runs gradient averaging within each subgroup (intra-node by default),
>>> # and post-localSGD optimizer runs global model averaging every 4 steps after applying the local optimizer.
>>> for step in range(0, 200):
>>> opt.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> opt.step()
>>> # In the first 100 steps, DDP runs global gradient averaging at every step.
>>> # After 100 steps, DDP runs gradient averaging within each subgroup (intra-node by default),
>>> # and post-localSGD optimizer runs global model averaging every 4 steps after applying the local optimizer.
>>> for step in range(0, 200):
>>> opt.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> opt.step()
"""
def __init__(

View File

@ -32,6 +32,7 @@ def register_functional_optim(key, optim):
need not be of :class:`torch.optim.Optimizer` (e.g. for custom optimizers)
Example::
>>> # import the new functional optimizer
>>> # xdoctest: +SKIP
>>> from xyz import fn_optimizer
>>> from torch.distributed.optim.utils import register_functional_optim
>>> fn_optim_key = "XYZ_optim"

View File

@ -331,6 +331,7 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable):
>>> from torch.distributed.optim import ZeroRedundancyOptimizer
>>> from torch.nn.parallel import DistributedDataParallel as DDP
>>> # xdoctest: +SKIP
>>> model = nn.Sequential(*[nn.Linear(2000, 2000).to(rank) for _ in range(20)])
>>> ddp = DDP(model, device_ids=[rank])
>>> opt = ZeroRedundancyOptimizer(
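The constructor call above is cut off; it typically continues along these lines (a sketch, assuming the default process group is already initialized and ``rank`` is this process's GPU index)::

    import torch
    import torch.distributed as dist
    import torch.nn as nn
    from torch.distributed.optim import ZeroRedundancyOptimizer
    from torch.nn.parallel import DistributedDataParallel as DDP

    rank = dist.get_rank()
    model = nn.Sequential(*[nn.Linear(2000, 2000).to(rank) for _ in range(20)])
    ddp = DDP(model, device_ids=[rank])

    # Each rank keeps optimizer state only for its own shard of the parameters.
    opt = ZeroRedundancyOptimizer(
        ddp.parameters(),
        optimizer_class=torch.optim.Adam,
        lr=0.01,
    )

    inputs = torch.randn(20, 2000).to(rank)
    ddp(inputs).sum().backward()
    opt.step()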

View File

@ -156,6 +156,7 @@ class WithDevice(nn.Module):
>>> # Dropout does not have any parameters/buffers, but we want to
>>> # run it on cuda:1 to avoid any GPU to CPU transfers.
>>> model = nn.Sequential(fc1, fc2, WithDevice(dropout, 'cuda:1'))
>>> # xdoctest: +SKIP
>>> model = Pipe(model, chunks=8)
"""
def __init__(self, module: nn.Module, device: torch.device):
@ -270,6 +271,7 @@ class Pipe(Module):
Pipeline of two FC layers across GPUs 0 and 1.
>>> # Need to initialize RPC framework first.
>>> # xdoctest: +SKIP
>>> os.environ['MASTER_ADDR'] = 'localhost'
>>> os.environ['MASTER_PORT'] = '29500'
>>> torch.distributed.rpc.init_rpc('worker', rank=0, world_size=1)
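A complete minimal version of the two-GPU pipeline above looks roughly like this sketch (the layer sizes, batch size, and port are illustrative)::

    import os
    import torch
    import torch.distributed.rpc as rpc
    import torch.nn as nn
    from torch.distributed.pipeline.sync import Pipe

    # Pipe needs the RPC framework initialized, even with a single worker.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    rpc.init_rpc('worker', rank=0, world_size=1)

    # Two stages placed on different GPUs.
    fc1 = nn.Linear(16, 8).cuda(0)
    fc2 = nn.Linear(8, 4).cuda(1)
    model = Pipe(nn.Sequential(fc1, fc2), chunks=8)

    # The forward pass splits the batch into micro-batches and returns an RRef.
    output_rref = model(torch.rand(32, 16).cuda(0))
    output = output_rref.to_here()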

View File

@ -160,6 +160,7 @@ def _wait_all():
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> with rpc._wait_all():
>>> fut_1 = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1))
@ -331,11 +332,12 @@ def shutdown(graceful=True, timeout=DEFAULT_SHUTDOWN_TIMEOUT):
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,
>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678
Then run the following code in two different processes:
>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -574,15 +576,17 @@ def remote(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
raised as they have not yet been handled.
Example::
Make sure that ``MASTER_ADDR`` and ``MASTER_PORT`` are set properly
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,
>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678
Then run the following code in two different processes:
>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -759,11 +763,12 @@ def rpc_sync(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,
>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678
Then run the following code in two different processes:
>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -850,11 +855,12 @@ def rpc_async(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,
>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678
Then run the following code in two different processes:
>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
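Each of the truncated worker-0 snippets above follows the same two-process shape; a consolidated sketch (with ``torch.add`` as the stand-in remote function) is::

    # On worker 0:
    import torch
    import torch.distributed.rpc as rpc

    rpc.init_rpc("worker0", rank=0, world_size=2)

    # Blocking call: returns torch.ones(2) + 3 computed on worker1.
    ret = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), 3))

    # Non-blocking variant: returns a Future.
    fut = rpc.rpc_async("worker1", torch.add, args=(torch.ones(2), 1))
    print(ret, fut.wait())

    rpc.shutdown()

    # On worker 1, in a separate process:
    #   import torch.distributed.rpc as rpc
    #   rpc.init_rpc("worker1", rank=1, world_size=2)
    #   rpc.shutdown()   # serves requests passively until shutdown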

View File

@ -56,6 +56,7 @@ def async_execution(fn):
>>> )
>>>
>>> # On worker0
>>> # xdoctest: +SKIP
>>> ret = rpc.rpc_sync(
>>> "worker1",
>>> async_add_chained,
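For reference, the callee invoked by the ``rpc_sync`` call above is typically defined with the decorator like this (a sketch; the chained three-tensor add mirrors the docstring's intent)::

    import torch
    import torch.distributed.rpc as rpc

    @rpc.functions.async_execution
    def async_add_chained(to, x, y, z):
        # Return a Future: the RPC layer sends the result once the chained
        # future completes, without blocking an RPC thread in the meantime.
        return rpc.rpc_async(to, torch.add, args=(x, y)).then(
            lambda fut: fut.wait() + z
        )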

View File

@ -127,6 +127,7 @@ class TensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
>>> options.set_device_map("worker1", {1: 2})
>>> # maps worker0's cuda:1 to worker1's cuda:2
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc(
>>> "worker0",
>>> rank=0,
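Fleshed out, the truncated ``init_rpc`` call that consumes these options looks roughly like this (the worker names and the cuda:1 to cuda:2 mapping are illustrative)::

    import torch.distributed.rpc as rpc

    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=8)
    # Tensors sent from worker0's cuda:1 land on worker1's cuda:2.
    options.set_device_map("worker1", {1: 2})

    rpc.init_rpc(
        "worker0",
        rank=0,
        world_size=2,
        rpc_backend_options=options,
    )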

View File

@ -55,6 +55,7 @@ class _server_process_global_profile(profile):
please use ``use_cuda = False`` or ``num_workers = 0``.
Example:
>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -67,7 +68,7 @@ class _server_process_global_profile(profile):
>>> inner_profile_rref.rpc_sync().__enter__()
>>> rpc.rpc_sync(dst_worker_name, torch.sub, (x, y))
>>> inner_profile_rref.rpc_sync().__exit__(None, None, None)
>>> outer_profile_rref.rpc_sync().__exit__(None, None, None
>>> outer_profile_rref.rpc_sync().__exit__(None, None, None)
>>> print(inner_profile_rref.rpc_sync().key_averages())
--------- --------------- --------------- --------------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls

View File

@ -82,7 +82,7 @@ Single-node multi-worker
::
>>> torchrun
torchrun
--standalone
--nnodes=1
--nproc_per_node=$NUM_TRAINERS
@ -101,7 +101,7 @@ port automatically instead of manually assigning different ports for each run.
::
>>> torchrun
torchrun
--rdzv_backend=c10d
--rdzv_endpoint=localhost:0
--nnodes=1
@ -114,7 +114,7 @@ Fault tolerant (fixed sized number of workers, no elasticity, tolerates 3 failur
::
>>> torchrun
torchrun
--nnodes=$NUM_NODES
--nproc_per_node=$NUM_TRAINERS
--max_restarts=3
@ -135,7 +135,7 @@ Elastic (``min=1``, ``max=4``, tolerates up to 3 membership changes or failures)
::
>>> torchrun
torchrun
--nnodes=1:4
--nproc_per_node=$NUM_TRAINERS
--max_restarts=3
@ -294,6 +294,7 @@ Important Notices
::
>>> # xdoctest: +SKIP("stub")
>>> import torch.distributed as dist
>>> dist.init_process_group(backend="gloo|nccl")
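torchrun exports ``LOCAL_RANK``, ``RANK``, and ``WORLD_SIZE`` for every worker it starts, so the worker-side setup usually reduces to the sketch below (the backend choice and the ``nn.Linear`` stand-in are assumptions)::

    import os
    import torch
    import torch.distributed as dist
    import torch.nn as nn

    # Provided by torchrun; no --local_rank argument parsing is needed.
    local_rank = int(os.environ["LOCAL_RANK"])

    dist.init_process_group(backend="nccl")   # or "gloo" on CPU-only nodes
    torch.cuda.set_device(local_rank)

    model = nn.Linear(10, 10).cuda(local_rank)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

    # ... training loop; checkpointing makes restarts after failures safe ...
    dist.destroy_process_group()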

View File

@ -19,6 +19,7 @@ class Bernoulli(ExponentialFamily):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Bernoulli(torch.tensor([0.3]))
>>> m.sample() # 30% chance 1; 70% chance 0
tensor([ 0.])

View File

@ -14,6 +14,7 @@ class Beta(ExponentialFamily):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Beta(torch.tensor([0.5]), torch.tensor([0.5]))
>>> m.sample() # Beta distributed with concentration concentration1 and concentration0
tensor([ 0.1046])

View File

@ -18,6 +18,7 @@ class Binomial(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Binomial(100, torch.tensor([0 , .2, .8, 1]))
>>> x = m.sample()
tensor([ 0., 22., 71., 100.])

View File

@ -35,6 +35,7 @@ class Categorical(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Categorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m.sample() # equal probability of 0, 1, 2, 3
tensor(3)

View File

@ -17,6 +17,7 @@ class Cauchy(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Cauchy(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Cauchy distribution with loc=0 and scale=1
tensor([ 2.3214])

View File

@ -10,6 +10,7 @@ class Chi2(Gamma):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Chi2(torch.tensor([1.0]))
>>> m.sample() # Chi2 distributed with shape df=1
tensor([ 0.1046])

View File

@ -22,6 +22,7 @@ class ContinuousBernoulli(ExponentialFamily):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = ContinuousBernoulli(torch.tensor([0.3]))
>>> m.sample()
tensor([ 0.2538])

View File

@ -33,6 +33,7 @@ class Dirichlet(ExponentialFamily):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Dirichlet(torch.tensor([0.5, 0.5]))
>>> m.sample() # Dirichlet distributed with concentration [0.5, 0.5]
tensor([ 0.1046, 0.8954])

View File

@ -13,6 +13,7 @@ class Exponential(ExponentialFamily):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Exponential(torch.tensor([1.0]))
>>> m.sample() # Exponential distributed with rate=1
tensor([ 0.1046])

View File

@ -14,6 +14,7 @@ class FisherSnedecor(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = FisherSnedecor(torch.tensor([1.0]), torch.tensor([2.0]))
>>> m.sample() # Fisher-Snedecor-distributed with df1=1 and df2=2
tensor([ 0.2453])

View File

@ -17,6 +17,7 @@ class Gamma(ExponentialFamily):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Gamma(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # Gamma distributed with concentration=1 and rate=1
tensor([ 0.1046])

View File

@ -19,6 +19,7 @@ class Geometric(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Geometric(torch.tensor([0.3]))
>>> m.sample() # underlying Bernoulli has 30% chance 1; 70% chance 0
tensor([ 2.])

View File

@ -15,6 +15,7 @@ class Gumbel(TransformedDistribution):
Examples::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Gumbel(torch.tensor([1.0]), torch.tensor([2.0]))
>>> m.sample() # sample from Gumbel distribution with loc=1, scale=2
tensor([ 1.0124])

View File

@ -18,6 +18,7 @@ class HalfCauchy(TransformedDistribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = HalfCauchy(torch.tensor([1.0]))
>>> m.sample() # half-cauchy distributed with scale=1
tensor([ 2.3214])

View File

@ -18,6 +18,7 @@ class HalfNormal(TransformedDistribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = HalfNormal(torch.tensor([1.0]))
>>> m.sample() # half-normal distributed with scale=1
tensor([ 0.1046])

View File

@ -15,17 +15,19 @@ class Independent(Distribution):
the same shape as a Multivariate Normal distribution (so they are
interchangeable), you can::
>>> from torch.distributions.multivariate_normal import MultivariateNormal
>>> from torch.distributions.normal import Normal
>>> loc = torch.zeros(3)
>>> scale = torch.ones(3)
>>> mvn = MultivariateNormal(loc, scale_tril=torch.diag(scale))
>>> [mvn.batch_shape, mvn.event_shape]
[torch.Size(()), torch.Size((3,))]
[torch.Size([]), torch.Size([3])]
>>> normal = Normal(loc, scale)
>>> [normal.batch_shape, normal.event_shape]
[torch.Size((3,)), torch.Size(())]
[torch.Size([3]), torch.Size([])]
>>> diagn = Independent(normal, 1)
>>> [diagn.batch_shape, diagn.event_shape]
[torch.Size(()), torch.Size((3,))]
[torch.Size([]), torch.Size([3])]
Args:
base_distribution (torch.distributions.distribution.Distribution): a

View File

@ -23,6 +23,7 @@ class Kumaraswamy(TransformedDistribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Kumaraswamy(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Kumaraswamy distribution with concentration alpha=1 and beta=1
tensor([ 0.1729])

View File

@ -12,6 +12,7 @@ class Laplace(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Laplace(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # Laplace distributed with loc=0, scale=1
tensor([ 0.1046])

View File

@ -34,6 +34,7 @@ class LKJCholesky(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> l = LKJCholesky(3, 0.5)
>>> l.sample() # l @ l.T is a sample of a correlation 3x3 matrix
tensor([[ 1.0000, 0.0000, 0.0000],

View File

@ -15,6 +15,7 @@ class LogNormal(TransformedDistribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = LogNormal(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # log-normal distributed with mean=0 and stddev=1
tensor([ 0.1046])

View File

@ -22,7 +22,8 @@ class LogisticNormal(TransformedDistribution):
>>> # logistic-normal distributed with mean=(0, 0, 0) and stddev=(1, 1, 1)
>>> # of the base Normal distribution
>>> m = distributions.LogisticNormal(torch.tensor([0.0] * 3), torch.tensor([1.0] * 3))
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = LogisticNormal(torch.tensor([0.0] * 3), torch.tensor([1.0] * 3))
>>> m.sample()
tensor([ 0.7653, 0.0341, 0.0579, 0.1427])

View File

@ -52,7 +52,7 @@ class LowRankMultivariateNormal(Distribution):
covariance_matrix = cov_factor @ cov_factor.T + cov_diag
Example:
>>> # xdoctest: +REQUIRES(--lapack)
>>> m = LowRankMultivariateNormal(torch.zeros(2), torch.tensor([[1.], [0.]]), torch.ones(2))
>>> m.sample() # normally distributed with mean=`[0,0]`, cov_factor=`[[1],[0]]`, cov_diag=`[1,1]`
tensor([-0.2102, -0.5429])

View File

@ -17,24 +17,25 @@ class MixtureSameFamily(Distribution):
Examples::
# Construct Gaussian Mixture Model in 1D consisting of 5 equally
# weighted normal distributions
>>> # xdoctest: +SKIP("undefined vars")
>>> # Construct Gaussian Mixture Model in 1D consisting of 5 equally
>>> # weighted normal distributions
>>> mix = D.Categorical(torch.ones(5,))
>>> comp = D.Normal(torch.randn(5,), torch.rand(5,))
>>> gmm = MixtureSameFamily(mix, comp)
# Construct Gaussian Mixture Modle in 2D consisting of 5 equally
# weighted bivariate normal distributions
>>> # Construct Gaussian Mixture Model in 2D consisting of 5 equally
>>> # weighted bivariate normal distributions
>>> mix = D.Categorical(torch.ones(5,))
>>> comp = D.Independent(D.Normal(
torch.randn(5,2), torch.rand(5,2)), 1)
... torch.randn(5,2), torch.rand(5,2)), 1)
>>> gmm = MixtureSameFamily(mix, comp)
# Construct a batch of 3 Gaussian Mixture Models in 2D each
# consisting of 5 random weighted bivariate normal distributions
>>> # Construct a batch of 3 Gaussian Mixture Models in 2D each
>>> # consisting of 5 random weighted bivariate normal distributions
>>> mix = D.Categorical(torch.rand(3,5))
>>> comp = D.Independent(D.Normal(
torch.randn(3,5,2), torch.rand(3,5,2)), 1)
... torch.randn(3,5,2), torch.rand(3,5,2)), 1)
>>> gmm = MixtureSameFamily(mix, comp)
Args:

View File

@ -32,6 +32,7 @@ class Multinomial(Distribution):
Example::
>>> # xdoctest: +SKIP("FIXME: found invalid values")
>>> m = Multinomial(100, torch.tensor([ 1., 1., 1., 1.]))
>>> x = m.sample() # equal probability of 0, 1, 2, 3
tensor([ 21., 24., 30., 25.])

View File

@ -91,6 +91,7 @@ class MultivariateNormal(Distribution):
Example:
>>> # xdoctest: +REQUIRES(--lapack)
>>> m = MultivariateNormal(torch.zeros(2), torch.eye(2))
>>> m.sample() # normally distributed with mean=`[0,0]` and covariance_matrix=`I`
tensor([-0.2102, -0.5429])

View File

@ -16,6 +16,7 @@ class Normal(ExponentialFamily):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # normally distributed with loc=0 and scale=1
tensor([ 0.1046])

View File

@ -25,6 +25,7 @@ class OneHotCategorical(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m.sample() # equal probability of 0, 1, 2, 3
tensor([ 0., 0., 0., 1.])

View File

@ -12,6 +12,7 @@ class Pareto(TransformedDistribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Pareto(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Pareto distribution with scale=1 and alpha=1
tensor([ 1.5623])

View File

@ -18,6 +18,7 @@ class Poisson(ExponentialFamily):
Example::
>>> # xdoctest: +SKIP("poisson_cpu not implemented for 'Long'")
>>> m = Poisson(torch.tensor([4]))
>>> m.sample()
tensor([ 3.])

View File

@ -100,8 +100,9 @@ class RelaxedBernoulli(TransformedDistribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = RelaxedBernoulli(torch.tensor([2.2]),
torch.tensor([0.1, 0.2, 0.3, 0.99]))
... torch.tensor([0.1, 0.2, 0.3, 0.99]))
>>> m.sample()
tensor([ 0.2951, 0.3442, 0.8918, 0.9021])

View File

@ -94,8 +94,9 @@ class RelaxedOneHotCategorical(TransformedDistribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = RelaxedOneHotCategorical(torch.tensor([2.2]),
torch.tensor([0.1, 0.2, 0.3, 0.4]))
... torch.tensor([0.1, 0.2, 0.3, 0.4]))
>>> m.sample()
tensor([ 0.1294, 0.2324, 0.3859, 0.2523])

View File

@ -15,6 +15,7 @@ class StudentT(Distribution):
Example::
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = StudentT(torch.tensor([2.0]))
>>> m.sample() # Student's t-distributed with degrees of freedom=2
tensor([ 0.1046])

View File

@ -17,6 +17,7 @@ class Uniform(Distribution):
>>> m = Uniform(torch.tensor([0.0]), torch.tensor([5.0]))
>>> m.sample() # uniformly distributed in the range [0.0, 5.0)
>>> # xdoctest: +SKIP
tensor([ 2.3418])
Args:

View File

@ -75,7 +75,8 @@ class VonMises(Distribution):
interpreted as angles modulo 2 pi.
Example::
>>> m = dist.VonMises(torch.tensor([1.0]), torch.tensor([1.0]))
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = VonMises(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # von Mises distributed with loc=1 and concentration=1
tensor([1.9777])

View File

@ -14,6 +14,7 @@ class Weibull(TransformedDistribution):
Example:
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Weibull(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Weibull distribution with scale=1, concentration=1
tensor([ 0.4784])

View File

@ -33,9 +33,10 @@ class Wishart(ExponentialFamily):
or its Cholesky decomposition :math:`\mathbf{\Sigma} = \mathbf{L}\mathbf{L}^\top`
Example:
>>> # xdoctest: +SKIP("FIXME: scale_tril must be at least two-dimensional")
>>> m = Wishart(torch.eye(2), torch.Tensor([2]))
>>> m.sample() # Wishart distributed with mean=`df * I` and
# variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j
>>> # variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j
Args:
covariance_matrix (Tensor): positive-definite covariance matrix

View File

@ -257,15 +257,18 @@ def einsum(*args: Any) -> Tensor:
Examples::
# trace
>>> # trace
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> torch.einsum('ii', torch.randn(4, 4))
tensor(-1.2104)
# diagonal
>>> # diagonal
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> torch.einsum('ii->i', torch.randn(4, 4))
tensor([-0.1034, 0.7952, -0.2433, 0.4545])
# outer product
>>> # outer product
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> x = torch.randn(5)
>>> y = torch.randn(4)
>>> torch.einsum('i,j->ij', x, y)
@ -275,7 +278,8 @@ def einsum(*args: Any) -> Tensor:
[ 0.1713, -0.4291, -0.5802, 0.7350],
[ 0.5704, -1.4290, -1.9323, 2.4480]])
# batch matrix multiplication
>>> # batch matrix multiplication
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> As = torch.randn(3,2,5)
>>> Bs = torch.randn(3,5,4)
>>> torch.einsum('bij,bjk->bik', As, Bs)
@ -288,7 +292,8 @@ def einsum(*args: Any) -> Tensor:
[[ 2.8153, 1.8787, -4.3839, -1.2112],
[ 0.3728, -2.1131, 0.0921, 0.8305]]])
# with sublist format and ellipsis
>>> # with sublist format and ellipsis
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> torch.einsum(As, [..., 0, 1], Bs, [..., 1, 2], [..., 0, 2])
tensor([[[-1.0564, -1.5904, 3.2023, 3.1271],
[-1.6706, -0.8097, -0.8025, -2.1183]],
@ -299,12 +304,12 @@ def einsum(*args: Any) -> Tensor:
[[ 2.8153, 1.8787, -4.3839, -1.2112],
[ 0.3728, -2.1131, 0.0921, 0.8305]]])
# batch permute
>>> # batch permute
>>> A = torch.randn(2, 3, 4, 5)
>>> torch.einsum('...ij->...ji', A).shape
torch.Size([2, 3, 5, 4])
# equivalent to torch.nn.functional.bilinear
>>> # equivalent to torch.nn.functional.bilinear
>>> A = torch.randn(3,5,4)
>>> l = torch.randn(2,5)
>>> r = torch.randn(2,4)
@ -453,6 +458,7 @@ else:
>>> z = torch.sin(torch.sqrt(x * x + y * y))
>>> ax = plt.axes(projection='3d')
>>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy())
>>> # xdoctest: +SKIP
<mpl_toolkits.mplot3d.art3d.Poly3DCollection object at 0x7f8f30d40100>
>>> plt.show()
@ -730,6 +736,7 @@ def _unique_impl(input: Tensor, sorted: bool = True,
>>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long))
>>> output
>>> # xdoctest: +SKIP
tensor([ 2, 3, 1])
>>> output, inverse_indices = torch.unique(
@ -1014,6 +1021,7 @@ def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811
>>> a = torch.randn(3, 4, 5, device='cuda')
>>> b = torch.randn(4, 5, 6, device='cuda')
>>> # xdoctest: +SKIP
>>> c = torch.tensordot(a, b, dims=2).cpu()
tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741],
[ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744],
@ -1081,6 +1089,7 @@ def cartesian_prod(*tensors):
>>> a = [1, 2, 3]
>>> b = [4, 5]
>>> # xdoctest: +SKIP
>>> list(itertools.product(a, b))
[(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]
>>> tensor_a = torch.tensor(a)
@ -1203,6 +1212,7 @@ def atleast_1d(*tensors):
>>> x = torch.randn(2)
>>> x
>>> # xdoctest: +SKIP
tensor([1.4584, 0.7583])
>>> torch.atleast_1d(x)
tensor([1.4584, 0.7583])
@ -1243,6 +1253,7 @@ def atleast_2d(*tensors):
tensor([[1.]])
>>> x = torch.randn(2,2)
>>> x
>>> # xdoctest: +SKIP
tensor([[2.2086, 2.5165],
[0.1757, 0.5194]])
>>> torch.atleast_2d(x)
@ -1280,6 +1291,7 @@ def atleast_3d(*tensors):
tensor([[[0.5000]]])
>>> y = torch.randn(2,2)
>>> y
>>> # xdoctest: +SKIP
tensor([[-0.8079, 0.7460],
[-1.1647, 1.4734]])
>>> torch.atleast_3d(y)
@ -1414,6 +1426,7 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa
>>> a = torch.arange(9, dtype= torch.float) - 4
>>> b = a.reshape((3, 3))
>>> torch.norm(a)
>>> # xdoctest: +SKIP
tensor(7.7460)
>>> torch.norm(b)
tensor(7.7460)
@ -1529,6 +1542,7 @@ def chain_matmul(*matrices, out=None):
>>> c = torch.randn(5, 6)
>>> d = torch.randn(6, 7)
>>> torch.chain_matmul(a, b, c, d)
>>> # xdoctest: +SKIP
tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614],
[ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163],
[ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]])
@ -1621,6 +1635,7 @@ def _lu_impl(A, pivot=True, get_infos=False, out=None):
Example::
>>> # xdoctest: +REQUIRES(--lapack)
>>> A = torch.randn(2, 3, 3)
>>> A_LU, pivots = torch.lu(A)
>>> A_LU

View File

@ -37,6 +37,7 @@ def _reify(o, s):
def reify(e, s):
""" Replace variables of expression with substitution
>>> # xdoctest: +SKIP
>>> x, y = var(), var()
>>> e = (1, x, (3, y))
>>> s = {x: 2, y: 4}

Some files were not shown because too many files have changed in this diff