Integrate xdoctest - Rebased (#82797)
Integrate xdoctest - Rebased (#82797)

This is a new version of #15648, rebased on the latest master branch. Unlike the previous PR, where I fixed a lot of the doctests in addition to integrating xdoctest, I'm reducing the scope here: I'm simply going to integrate xdoctest and then mark all of the failing tests as "SKIP". This will let xdoctest run on the dashboards, provide some value, and still let the dashboards pass. I'll leave fixing the doctests themselves to another PR.

In my initial commit, I do the bare minimum to get something running, with failing dashboards. The few tests that I marked as skip are causing segfaults. Running xdoctest results in 293 failed and 201 passed tests; the next commits will disable the failing ones. (Unfortunately I don't have a tool that will insert the `# xdoctest: +SKIP` directive over every failing test, so I'm going to do this mostly manually.)

Fixes https://github.com/pytorch/pytorch/issues/71105
@ezyang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/82797
Approved by: https://github.com/ezyang
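For readers unfamiliar with the directive, marking a failing doctest looks like the following. This is a minimal illustrative sketch with a made-up function (`hypothetical_op` is not a real torch API and does not appear in this diff); the directive sits on its own `>>>` line inside the example, so xdoctest still collects the example but skips executing it.

    def hypothetical_op(x):
        """Illustrative only; not a real torch function.

        Example::

            >>> # xdoctest: +SKIP("fails on the dashboards; fix in a follow-up PR")
            >>> hypothetical_op(torch.ones(3)).sum()
            tensor(3.)
        """
        # torch is only referenced inside the skipped doctest above.
        return x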
Committed by: PyTorch MergeBot
Parent: ba90c9f229
Commit: 4618371da5
@@ -164,6 +164,11 @@ pytest-rerunfailures
#Pinned versions:
#test that import:

#xdoctest
#Description: runs doctests in pytest
#Pinned versions:
#test that import:

#PyYAML
#Description: data serialization format
#Pinned versions:
@@ -17,6 +17,8 @@ pip install "unittest-xml-reporting<=3.2.0,>=2.0.0" \
  pytest \
  pytest-xdist \
  pytest-rerunfailures
# TODO: enable xdoctest later
# xdoctest

if [ -z "${CI}" ]; then
  rm -rf "${WORKSPACE_DIR}"/miniconda3/lib/python3.6/site-packages/torch*
@@ -36,7 +36,8 @@ popd
=======
:: Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014

pip install "ninja==1.10.0.post1" future "hypothesis==5.35.1" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest pytest-xdist pytest-rerunfailures
pip install "ninja==1.10.0.post1" future "hypothesis==5.35.1" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest pytest-xdist pytest-rerunfailures
:: # TODO: enable xdoctest later
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
@@ -490,6 +490,51 @@ def coverage_post_process(app, exception):
        for o in output:
            f.write(o)


def process_docstring(app, what_, name, obj, options, lines):
    """
    Custom process to transform docstring lines. Removes "Ignore" blocks.

    Args:
        app (sphinx.application.Sphinx): the Sphinx application object

        what (str):
            the type of the object which the docstring belongs to (one of
            "module", "class", "exception", "function", "method", "attribute")

        name (str): the fully qualified name of the object

        obj: the object itself

        options: the options given to the directive: an object with
            attributes inherited_members, undoc_members, show_inheritance
            and noindex that are true if the flag option of same name was
            given to the auto directive

        lines (List[str]): the lines of the docstring, see above

    References:
        https://www.sphinx-doc.org/en/1.5.1/_modules/sphinx/ext/autodoc.html
        https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html
    """
    import re
    remove_directives = [
        # Remove all xdoctest directives
        re.compile(r'\s*>>>\s*#\s*x?doctest:\s*.*'),
        re.compile(r'\s*>>>\s*#\s*x?doc:\s*.*'),
    ]
    filtered_lines = [
        line for line in lines
        if not any(pat.match(line) for pat in remove_directives)
    ]
    # Modify the lines inplace
    lines[:] = filtered_lines

    # make sure there is a blank line at the end
    if lines and lines[-1].strip():
        lines.append('')


# Called automatically by Sphinx, making this `conf.py` an "extension".
def setup(app):
    # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value
@@ -506,6 +551,7 @@ def setup(app):
        add_css(css_file)

    app.connect("build-finished", coverage_post_process)
    app.connect('autodoc-process-docstring', process_docstring)

# From PyTorch 1.5, we now use autogenerated files to document classes and
# functions. This breaks older references since
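To make the effect of this hook concrete, here is a minimal, self-contained sketch of the filtering step using the same regexes as above; the sample docstring lines are hypothetical, not taken from this diff. The rendered docs never show the `# xdoctest:` control comments, while the runner still sees them in the source.

    import re

    # Same directive-matching patterns as the remove_directives list above.
    patterns = [
        re.compile(r'\s*>>>\s*#\s*x?doctest:\s*.*'),
        re.compile(r'\s*>>>\s*#\s*x?doc:\s*.*'),
    ]

    # Hypothetical docstring lines, as autodoc would hand them to the hook.
    lines = [
        "Example::",
        "",
        "    >>> # xdoctest: +SKIP",
        "    >>> torch.ones(2) + 1",
        "    tensor([2., 2.])",
    ]
    lines[:] = [ln for ln in lines if not any(p.match(ln) for p in patterns)]
    print(lines)  # the directive line is gone; the example itself is untouched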
@@ -5,6 +5,7 @@ dependencies:
- numpy
- pytest
- pytest-cov
- xdoctest
- codecov
- pip
- pyyaml
@@ -7,6 +7,11 @@ addopts =
    # capture only Python print and C++ py::print, but not C output (low-level Python errors)
    --capture=sys
    --disable-warnings
    # TODO: enable xdoctest later
    #--xdoctest
    #--xdoctest-style=google
    #--xdoctest-global-exec="from torch import nn\nimport torch.nn.functional as F\nimport torch"
    #--xdoctest-options=+IGNORE_WHITESPACE
testpaths =
    test
junit_logging_reruns = all
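For reference, the commented-out flags above can be exercised by hand before they are enabled in CI. Below is a sketch of an equivalent programmatic invocation; it assumes the xdoctest pytest plugin is installed, and the target path is only an example chosen for illustration.

    import pytest

    # Mirrors the commented-out addopts entries above; the raw string keeps the
    # literal \n separators used by the global-exec option in this repo.
    pytest.main([
        "--xdoctest",
        "--xdoctest-style=google",
        r"--xdoctest-global-exec=from torch import nn\nimport torch.nn.functional as F\nimport torch",
        "--xdoctest-options=+IGNORE_WHITESPACE",
        "torch/nn/functional.py",  # example target; any file or directory works
    ])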
test/run_doctests.sh  (new executable file, 29 lines)
@@ -0,0 +1,29 @@
#!/bin/bash
__doc__="
This script simply runs the torch doctests via the xdoctest runner.

This must be run from the root of the torch repo, as it needs the path to the
torch source code.
"

#xdoctest -m torch --style=google list

# Reference: https://stackoverflow.com/questions/59895/bash-script-dir
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
TORCH_MODPATH=$SCRIPT_DIR/../torch
echo "TORCH_MODPATH = $TORCH_MODPATH"

if [[ ! -d "$TORCH_MODPATH" ]] ; then
    echo "Could not find the path to the torch module"
else
    # Next version of xdoctest will support environment variables that overlo

    export XDOCTEST_GLOBAL_EXEC="from torch import nn\nimport torch.nn.functional as F\nimport torch"
    export XDOCTEST_OPTIONS="+IGNORE_WHITESPACE"
    # Note: google won't catch numpy-style docstrings (a few exist) but it also won't fail
    # on things not intended to be doctests.
    export XDOCTEST_STYLE="google"
    xdoctest "$TORCH_MODPATH" --style="$XDOCTEST_STYLE" --global-exec "$XDOCTEST_GLOBAL_EXEC" --options="$XDOCTEST_OPTIONS"
fi
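The commented `xdoctest -m torch --style=google list` line near the top of the script hints at a narrower, per-module workflow. A sketch of the same idea via xdoctest's Python entry point follows; the module name is chosen only as an example, and the exact keyword arguments are an assumption based on xdoctest's documented API rather than anything in this diff.

    import xdoctest

    # List the doctests xdoctest discovers in a single module without running
    # them, mirroring the commented-out CLI line in the script above.
    xdoctest.doctest_module("torch.nn.functional", command="list", style="google")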
@@ -348,6 +348,20 @@ def get_executable_command(options, allow_pytest, disable_coverage=False):
    if options.pytest:
        if allow_pytest:
            executable += ["-m", "pytest"]
            # Enable xdoctest
            # TODO: enable xdoctest later
            # Many doctests assume the existence of these variables
            # xdoctest_global_exec_lines = r'\n'.join([
            #     'from torch import nn',
            #     'import torch.nn.functional as F',
            #     'import torch',
            # ])
            # executable += [
            #     "--xdoctest",
            #     "--xdoctest-style=google",
            #     f"--xdoctest-global-exec='{xdoctest_global_exec_lines}'",
            #     "--xdoctest-options=+IGNORE_WHITESPACE"
            # ]
        else:
            print_to_stderr(
                "Pytest cannot be used for this test. Falling back to unittest."
@@ -318,6 +318,7 @@ def set_default_tensor_type(t):

    Example::

        >>> # xdoctest: +SKIP("Other tests may have changed the default type. Can we reset it?")
        >>> torch.tensor([1.2, 3]).dtype  # initial default for floating point is torch.float32
        torch.float32
        >>> torch.set_default_tensor_type(torch.DoubleTensor)
@@ -354,6 +355,7 @@ def set_default_dtype(d):
        Either torch.float32 or torch.float64.

    Example:
        >>> # xdoctest: +SKIP("Other tests may have changed the default type. Can we reset it?")
        >>> # initial default for floating point is torch.float32
        >>> # Python floats are interpreted as float32
        >>> torch.tensor([1.2, 3]).dtype
@@ -493,6 +495,7 @@ def use_deterministic_algorithms(mode, *, warn_only=False):
        >>> torch.use_deterministic_algorithms(True)

        # Forward mode nondeterministic error
        >>> # xdoctest: +SKIP
        >>> torch.randn(10, device='cuda').kthvalue(0)
        ...
        RuntimeError: kthvalue CUDA does not have a deterministic implementation...
@ -128,6 +128,7 @@ def update_names(tensor, names, rename_map, inplace):
|
||||
|
||||
>>> x.rename('batch', '...', 'width').names
|
||||
('batch', 'C', 'H', 'width')
|
||||
|
||||
```
|
||||
|
||||
tensor.rename(**rename_map) returns a view on tensor that has rename dims
|
||||
@ -138,6 +139,7 @@ def update_names(tensor, names, rename_map, inplace):
|
||||
>>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W'))
|
||||
>>> x.rename(W='width', H='height').names
|
||||
('N', 'C', 'height', 'width')
|
||||
|
||||
```
|
||||
|
||||
Finally, tensor.rename has an in-place version called tensor.rename_.
|
||||
|
@ -103,6 +103,7 @@ class TorchRefsMode(torch.overrides.TorchFunctionMode):
|
||||
Switches the interpretation of torch.* functions and Tensor methods to
|
||||
use PrimTorch refs in torch._refs. (Direct calls to _refs are unaffected.)
|
||||
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> with TorchRefsMode():
|
||||
... torch.add(x, y) # calls torch._refs.add(x, y)
|
||||
|
||||
|
@ -1197,7 +1197,7 @@ class Tensor(torch._C._TensorBase):
|
||||
|
||||
>>> renamed_imgs = imgs.rename(None)
|
||||
>>> renamed_imgs.names
|
||||
(None,)
|
||||
(None, None, None, None)
|
||||
|
||||
>>> renamed_imgs = imgs.rename('batch', 'channel', 'height', 'width')
|
||||
>>> renamed_imgs.names
|
||||
|
@ -135,6 +135,7 @@ def fuse_modules(model, modules_to_fuse, inplace=False, fuser_func=fuse_known_mo
|
||||
|
||||
Examples::
|
||||
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> m = M().eval()
|
||||
>>> # m is a module containing the sub-modules below
|
||||
>>> modules_to_fuse = [ ['conv1', 'bn1', 'relu1'], ['submodule.conv', 'submodule.relu']]
|
||||
|
@ -21,6 +21,7 @@ def fuse_conv_bn(is_qat, conv, bn):
|
||||
|
||||
>>> m1 = nn.Conv2d(10, 20, 3)
|
||||
>>> b1 = nn.BatchNorm2d(20)
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> m2 = fuse_conv_bn(m1, b1)
|
||||
"""
|
||||
assert(conv.training == bn.training),\
|
||||
@ -58,6 +59,7 @@ def fuse_conv_bn_relu(is_qat, conv, bn, relu):
|
||||
>>> m1 = nn.Conv2d(10, 20, 3)
|
||||
>>> b1 = nn.BatchNorm2d(20)
|
||||
>>> r1 = nn.ReLU(inplace=False)
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> m2 = fuse_conv_bn_relu(m1, b1, r1)
|
||||
"""
|
||||
assert(conv.training == bn.training == relu.training),\
|
||||
@ -103,6 +105,7 @@ def fuse_linear_bn(is_qat, linear, bn):
|
||||
|
||||
>>> m1 = nn.Linear(20, 10)
|
||||
>>> b1 = nn.BatchNorm1d(10)
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> m2 = fuse_linear_bn(m1, b1)
|
||||
"""
|
||||
assert(linear.training == bn.training),\
|
||||
@ -130,6 +133,7 @@ def fuse_convtranspose_bn(is_qat, convt, bn):
|
||||
|
||||
>>> m1 = nn.ConvTranspose2d(10, 20, 3)
|
||||
>>> b1 = nn.BatchNorm2d(20)
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> m2 = fuse_convtranspose_bn(m1, b1)
|
||||
"""
|
||||
assert(convt.training == bn.training),\
|
||||
|
@ -74,6 +74,7 @@ class ModelReport:
|
||||
8.) Call model_report.generate_qconfigs to generate the qconfigs based on the report suggestions
|
||||
|
||||
Example (with QuantizationTracer):
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> # get the necessary qconfig
|
||||
>>> config = PrepareCustomConfig()
|
||||
>>> skipped_module_names, skipped_module_classes = get_skipped_module_name_and_classes(config, False)
|
||||
|
@ -321,10 +321,11 @@ class ModelReportVisualizer:
|
||||
The rest of the rows will contain data
|
||||
|
||||
Example Use:
|
||||
>>> # xdoctest: +SKIP("undefined variables")
|
||||
>>> mod_report_visualizer.generate_filtered_tables(
|
||||
feature_filter = "per_channel_min",
|
||||
module_fqn_filter = "block1"
|
||||
) # generates table with per_channel_min info for all modules in block 1 of the model
|
||||
... feature_filter = "per_channel_min",
|
||||
... module_fqn_filter = "block1"
|
||||
... ) # generates table with per_channel_min info for all modules in block 1 of the model
|
||||
"""
|
||||
# first get the filtered data
|
||||
filtered_data: OrderedDict[str, Any] = self._get_filtered_data(feature_filter, module_fqn_filter)
|
||||
@ -403,12 +404,13 @@ class ModelReportVisualizer:
|
||||
Default = "", results in all the modules in the reports to be visible in the table
|
||||
|
||||
Example Use:
|
||||
>>> # xdoctest: +SKIP("undefined variables")
|
||||
>>> mod_report_visualizer.generate_table_visualization(
|
||||
feature_filter = "per_channel_min",
|
||||
module_fqn_filter = "block1"
|
||||
)
|
||||
# prints out neatly formatted table with per_channel_min info for
|
||||
all modules in block 1 of the model
|
||||
... feature_filter = "per_channel_min",
|
||||
... module_fqn_filter = "block1"
|
||||
... )
|
||||
>>> # prints out neatly formatted table with per_channel_min info
|
||||
>>> # for all modules in block 1 of the model
|
||||
"""
|
||||
# see if we got tabulate
|
||||
if not got_tabulate:
|
||||
@ -552,13 +554,14 @@ class ModelReportVisualizer:
|
||||
Default = "", results in all the modules in the reports to be visible in the table
|
||||
|
||||
Example Use:
|
||||
>>> # xdoctest: +SKIP("undefined variables")
|
||||
>>> mod_report_visualizer.generate_plot_visualization(
|
||||
feature_filter = "per_channel_min",
|
||||
module_fqn_filter = "block1"
|
||||
)
|
||||
# outputs line plot of per_channel_min information for all modules in block1 of model
|
||||
each channel gets it's own line, and it's plotted across the in-order modules
|
||||
on the x-axis
|
||||
... feature_filter = "per_channel_min",
|
||||
... module_fqn_filter = "block1"
|
||||
... )
|
||||
>>> # outputs line plot of per_channel_min information for all
|
||||
>>> # modules in block1 of model; each channel gets its own line,
|
||||
>>> # and it is plotted across the in-order modules on the x-axis
|
||||
"""
|
||||
# checks if we have matplotlib and let's user know to install it if don't
|
||||
if not got_matplotlib:
|
||||
@ -613,10 +616,11 @@ class ModelReportVisualizer:
|
||||
Default = 10, the values will be split into 10 equal sized bins
|
||||
|
||||
Example Use:
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> mod_report_visualizer.generate_histogram_visualization(
|
||||
feature_filter = "per_channel_min",
|
||||
module_fqn_filter = "block1"
|
||||
)
|
||||
... feature_filter = "per_channel_min",
|
||||
... module_fqn_filter = "block1"
|
||||
... )
|
||||
# outputs histogram of per_channel_min information for all modules in block1 of model
|
||||
information is gathered across all channels for all modules in block 1 for the
|
||||
per_channel_min and is displayed in a histogram of equally sized bins
|
||||
|
@ -83,6 +83,7 @@ def _with_args(cls_or_self, **kwargs):
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP("Undefined vars")
|
||||
>>> Foo.with_args = classmethod(_with_args)
|
||||
>>> foo_builder = Foo.with_args(a=3, b=4).with_args(answer=42)
|
||||
>>> foo_instance1 = foo_builder()
|
||||
@ -103,11 +104,12 @@ def _with_callable_args(cls_or_self, **kwargs):
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP("Undefined vars")
|
||||
>>> Foo.with_callable_args = classmethod(_with_callable_args)
|
||||
>>> Foo.with_args = classmethod(_with_args)
|
||||
>>> foo_builder = Foo.with_callable_args(cur_time=get_time_func).with_args(name="dan")
|
||||
>>> foo_instance1 = foo_builder()
|
||||
>>> wait 50
|
||||
>>> # wait 50
|
||||
>>> foo_instance2 = foo_builder()
|
||||
>>> id(foo_instance1.creation_time) == id(foo_instance2.creation_time)
|
||||
False
|
||||
|
@ -30,32 +30,33 @@ class ActivationSparsifier:
|
||||
specifies how inputs should be aggregated over time.
|
||||
The aggregate_fn should usually take 2 torch tensors and return the aggregated tensor.
|
||||
Example
|
||||
>>> def add_agg_fn(tensor1, tensor2): return tensor1 + tensor2
|
||||
reduce_fn (Optional, Callable):
|
||||
default reduce_fn that is used if not specified while registering the layer.
|
||||
reduce_fn will be called on the aggregated tensor i.e. the tensor obtained after
|
||||
calling agg_fn() on all inputs.
|
||||
Example
|
||||
>>> def mean_reduce_fn(agg_tensor): return agg_tensor.mean(dim=0)
|
||||
mask_fn (Optional, Callable):
|
||||
default mask_fn that is used to create the sparsification mask using the tensor obtained after
|
||||
calling the reduce_fn(). This is used by default if a custom one is passed in the
|
||||
register_layer().
|
||||
Note that the mask_fn() definition should contain the sparse arguments that is passed in sparse_config
|
||||
arguments.
|
||||
features (Optional, list):
|
||||
default selected features to sparsify.
|
||||
If this is non-empty, then the mask_fn will be applied for each feature of the input.
|
||||
For example,
|
||||
>>> mask = [mask_fn(reduce_fn(aggregated_fn(input[feature])) for feature in features]
|
||||
feature_dim (Optional, int):
|
||||
default dimension of input features. Again, features along this dim will be chosen
|
||||
for sparsification.
|
||||
sparse_config (Dict):
|
||||
Default configuration for the mask_fn. This config will be passed
|
||||
with the mask_fn()
|
||||
def add_agg_fn(tensor1, tensor2): return tensor1 + tensor2
|
||||
reduce_fn (Optional, Callable):
|
||||
default reduce_fn that is used if not specified while registering the layer.
|
||||
reduce_fn will be called on the aggregated tensor i.e. the tensor obtained after
|
||||
calling agg_fn() on all inputs.
|
||||
Example
|
||||
def mean_reduce_fn(agg_tensor): return agg_tensor.mean(dim=0)
|
||||
mask_fn (Optional, Callable):
|
||||
default mask_fn that is used to create the sparsification mask using the tensor obtained after
|
||||
calling the reduce_fn(). This is used by default if a custom one is passed in the
|
||||
register_layer().
|
||||
Note that the mask_fn() definition should contain the sparse arguments that is passed in sparse_config
|
||||
arguments.
|
||||
features (Optional, list):
|
||||
default selected features to sparsify.
|
||||
If this is non-empty, then the mask_fn will be applied for each feature of the input.
|
||||
For example,
|
||||
mask = [mask_fn(reduce_fn(aggregated_fn(input[feature])) for feature in features]
|
||||
feature_dim (Optional, int):
|
||||
default dimension of input features. Again, features along this dim will be chosen
|
||||
for sparsification.
|
||||
sparse_config (Dict):
|
||||
Default configuration for the mask_fn. This config will be passed
|
||||
with the mask_fn()
|
||||
|
||||
Example:
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> model = SomeModel()
|
||||
>>> act_sparsifier = ActivationSparsifier(...) # init activation sparsifier
|
||||
>>> # Initialize aggregate_fn
|
||||
@ -74,6 +75,7 @@ class ActivationSparsifier:
|
||||
>>> act_sparsifier.register_layer(model.some_layer, aggregate_fn=agg_fn, reduce_fn=reduce_fn, mask_fn=mask_fn)
|
||||
>>>
|
||||
>>> # start training process
|
||||
>>> for _ in [...]:
|
||||
>>> # epoch starts
|
||||
>>> # model.forward(), compute_loss() and model.backwards()
|
||||
>>> # epoch ends
|
||||
|
@ -89,11 +89,11 @@ class BaseDataScheduler(object):
|
||||
is called.
|
||||
|
||||
Example:
|
||||
>>> def get_schedule_param(self):
|
||||
new_param = {}
|
||||
for name in self.sparsifier.data_groups.keys():
|
||||
new_param[name] = self.sparsifier.data_groups[name][self.schedule_param] * 0.5
|
||||
return new_param
|
||||
>>> def get_schedule_param(self):
|
||||
... new_param = {}
|
||||
... for name in self.sparsifier.data_groups.keys():
|
||||
... new_param[name] = self.sparsifier.data_groups[name][self.schedule_param] * 0.5
|
||||
... return new_param
|
||||
|
||||
When the step() function is called, the value in self.sparsifier.data_groups[name][self.schedule_param]
|
||||
would be halved
|
||||
|
@ -51,7 +51,7 @@ class BaseDataSparsifier(base_sparsifier.BaseSparsifier):
|
||||
configuration. Only the keys that don't exist in the `config` will
|
||||
be updated.
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> data_list = [('tensor_1', torch.randn(3,3)), ('tensor_2', torch.randn(4,4))]
|
||||
>>> defaults = {'sparsity_level': 0.7}
|
||||
>>> sparsifier = DerivedDataSparsifier(data_list = data_list, **defaults) # Some sparsifier that inherits BaseDataSparsifier
|
||||
|
@ -19,6 +19,7 @@ class LambdaSL(BaseScheduler):
|
||||
>>> # Assuming sparsifier has two groups.
|
||||
>>> lambda1 = lambda epoch: epoch // 30
|
||||
>>> lambda2 = lambda epoch: 0.95 ** epoch
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> scheduler = LambdaSL(sparsifier, sl_lambda=[lambda1, lambda2])
|
||||
>>> for epoch in range(100):
|
||||
>>> train(...)
|
||||
|
@ -43,7 +43,8 @@ class BaseSparsifier(abc.ABC):
|
||||
|
||||
Example::
|
||||
|
||||
>>> config = [{'tensor_fqn': 'layer1.weight', {'tensor_fqn': 'linear2.weight2', 'sparsity_level': 0.5}]
|
||||
>>> # xdoctest: +SKIP("Can't instantiate abstract class BaseSparsifier with abstract method update_mask")
|
||||
>>> config = [{'tensor_fqn': 'layer1.weight', 'tensor_fqn': 'linear2.weight2', 'sparsity_level': 0.5}]
|
||||
>>> defaults = {'sparsity_level': 0.7}
|
||||
>>> # model.layer1.weight will have `sparsity_level` = 0.7 (getting default)
|
||||
>>> sparsifier = BaseSparsifier(config, defaults)
|
||||
@ -233,6 +234,7 @@ class BaseSparsifier(abc.ABC):
|
||||
to save in the `sparse_params`
|
||||
|
||||
Examples:
|
||||
>>> # xdoctest: +SKIP("locals are undefined")
|
||||
>>> # Don't save any sparse params
|
||||
>>> sparsifier.squash_mask()
|
||||
>>> hasattr(model.submodule1, 'sparse_params')
|
||||
|
@ -56,6 +56,7 @@ def make_dual(tensor, tangent, *, level=None):
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP("Undefined variables")
|
||||
>>> with dual_level():
|
||||
... inp = make_dual(x, v)
|
||||
... out = f(inp)
|
||||
@ -95,6 +96,7 @@ def unpack_dual(tensor, *, level=None):
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP("Undefined variables")
|
||||
>>> with dual_level():
|
||||
... inp = make_dual(x, x_t)
|
||||
... out = f(inp)
|
||||
@ -130,6 +132,7 @@ class dual_level(_DecoratorContextManager):
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP("Undefined variables")
|
||||
>>> x = torch.tensor([1])
|
||||
>>> x_t = torch.tensor([1])
|
||||
>>> with dual_level():
|
||||
|
@ -83,6 +83,7 @@ class FunctionCtx(object):
|
||||
See :ref:`extending-autograd` for more details on how to use this method.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> class Func(torch.autograd.Function):
|
||||
>>> @staticmethod
|
||||
>>> def forward(ctx, x: torch.Tensor, y: torch.Tensor, z: int):
|
||||
@ -149,6 +150,7 @@ class FunctionCtx(object):
|
||||
>>> b = a * a
|
||||
>>> Inplace.apply(a) # This would lead to wrong gradients!
|
||||
>>> # but the engine would not know unless we mark_dirty
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> b.backward() # RuntimeError: one of the variables needed for gradient
|
||||
>>> # computation has been modified by an inplace operation
|
||||
|
||||
@ -314,6 +316,7 @@ class Function(with_metaclass(FunctionMeta, _C._FunctionBase, FunctionCtx, _Hook
|
||||
>>> return grad_output * result
|
||||
>>>
|
||||
>>> # Use it by calling the apply method:
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> output = Exp.apply(input)
|
||||
"""
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
@ -240,6 +240,7 @@ def vjp(func, inputs, v=None, create_graph=False, strict=False):
|
||||
... return x.exp().sum(dim=1)
|
||||
>>> inputs = torch.rand(4, 4)
|
||||
>>> v = torch.ones(4)
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> vjp(exp_reducer, inputs, v)
|
||||
(tensor([5.7817, 7.2458, 5.7830, 6.7782]),
|
||||
tensor([[1.4458, 1.3962, 1.3042, 1.6354],
|
||||
@ -336,6 +337,7 @@ def jvp(func, inputs, v=None, create_graph=False, strict=False):
|
||||
... return x.exp().sum(dim=1)
|
||||
>>> inputs = torch.rand(4, 4)
|
||||
>>> v = torch.ones(4, 4)
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> jvp(exp_reducer, inputs, v)
|
||||
(tensor([6.3090, 4.6742, 7.9114, 8.2106]),
|
||||
tensor([6.3090, 4.6742, 7.9114, 8.2106]))
|
||||
@ -535,6 +537,7 @@ def jacobian(func, inputs, create_graph=False, strict=False, vectorize=False, st
|
||||
>>> def exp_reducer(x):
|
||||
... return x.exp().sum(dim=1)
|
||||
>>> inputs = torch.rand(2, 2)
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> jacobian(exp_reducer, inputs)
|
||||
tensor([[[1.4917, 2.4352],
|
||||
[0.0000, 0.0000]],
|
||||
@ -744,6 +747,7 @@ def hessian(func, inputs, create_graph=False, strict=False, vectorize=False, out
|
||||
>>> def pow_reducer(x):
|
||||
... return x.pow(3).sum()
|
||||
>>> inputs = torch.rand(2, 2)
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> hessian(pow_reducer, inputs)
|
||||
tensor([[[[5.2265, 0.0000],
|
||||
[0.0000, 0.0000]],
|
||||
@ -847,6 +851,7 @@ def vhp(func, inputs, v=None, create_graph=False, strict=False):
|
||||
... return x.pow(3).sum()
|
||||
>>> inputs = torch.rand(2, 2)
|
||||
>>> v = torch.ones(2, 2)
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> vhp(pow_reducer, inputs, v)
|
||||
(tensor(0.5591),
|
||||
tensor([[1.0689, 1.2431],
|
||||
@ -936,6 +941,7 @@ def hvp(func, inputs, v=None, create_graph=False, strict=False):
|
||||
... return x.pow(3).sum()
|
||||
>>> inputs = torch.rand(2, 2)
|
||||
>>> v = torch.ones(2, 2)
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> hvp(pow_reducer, inputs, v)
|
||||
(tensor(0.1448),
|
||||
tensor([[2.0239, 1.6456],
|
||||
|
@ -110,7 +110,7 @@ class no_grad(_DecoratorContextManager):
|
||||
your dual tensors.
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> x = torch.tensor([1.], requires_grad=True)
|
||||
>>> with torch.no_grad():
|
||||
... y = x * 2
|
||||
@ -156,7 +156,7 @@ class enable_grad(_DecoratorContextManager):
|
||||
This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> x = torch.tensor([1.], requires_grad=True)
|
||||
>>> with torch.no_grad():
|
||||
... with torch.enable_grad():
|
||||
@ -165,6 +165,7 @@ class enable_grad(_DecoratorContextManager):
|
||||
True
|
||||
>>> y.backward()
|
||||
>>> x.grad
|
||||
tensor([2.])
|
||||
>>> @torch.enable_grad()
|
||||
... def doubler(x):
|
||||
... return x * 2
|
||||
@ -205,18 +206,18 @@ class set_grad_enabled(_DecoratorContextManager):
|
||||
This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> x = torch.tensor([1.], requires_grad=True)
|
||||
>>> is_train = False
|
||||
>>> with torch.set_grad_enabled(is_train):
|
||||
... y = x * 2
|
||||
>>> y.requires_grad
|
||||
False
|
||||
>>> torch.set_grad_enabled(True)
|
||||
>>> _ = torch.set_grad_enabled(True)
|
||||
>>> y = x * 2
|
||||
>>> y.requires_grad
|
||||
True
|
||||
>>> torch.set_grad_enabled(False)
|
||||
>>> _ = torch.set_grad_enabled(False)
|
||||
>>> y = x * 2
|
||||
>>> y.requires_grad
|
||||
False
|
||||
@ -268,6 +269,7 @@ class inference_mode(_DecoratorContextManager):
|
||||
... y = x * x
|
||||
>>> y.requires_grad
|
||||
False
|
||||
>>> # xdoctest: +SKIP("want string isnt quite right")
|
||||
>>> y._version
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
|
@ -47,11 +47,11 @@ class saved_tensors_hooks():
|
||||
>>> b = torch.ones(5, requires_grad=True) * 2
|
||||
>>> with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
|
||||
... y = a * b
|
||||
Packing tensor([1., 1., 1., 1., 1.])
|
||||
Packing tensor([2., 2., 2., 2., 2.])
|
||||
Packing tensor([1., 1., 1., 1., 1.], requires_grad=True)
|
||||
Packing tensor([2., 2., 2., 2., 2.], grad_fn=<MulBackward0>)
|
||||
>>> y.sum().backward()
|
||||
Unpacking tensor([1., 1., 1., 1., 1.])
|
||||
Unpacking tensor([2., 2., 2., 2., 2.])
|
||||
Unpacking tensor([1., 1., 1., 1., 1.], requires_grad=True)
|
||||
Unpacking tensor([2., 2., 2., 2., 2.], grad_fn=<MulBackward0>)
|
||||
|
||||
.. warning ::
|
||||
Performing an inplace operation on the input to either hooks may lead
|
||||
@ -93,6 +93,7 @@ class save_on_cpu(saved_tensors_hooks):
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +REQUIRES(env:CUDAHOME)
|
||||
>>> a = torch.randn(5, requires_grad=True, device="cuda")
|
||||
>>> b = torch.randn(5, requires_grad=True, device="cuda")
|
||||
>>> c = torch.randn(5, requires_grad=True, device="cuda")
|
||||
|
@ -118,11 +118,12 @@ class profile(object):
|
||||
please use ``use_cuda = False`` or ``num_workers = 0``.
|
||||
|
||||
Example:
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> x = torch.randn((1, 1), requires_grad=True)
|
||||
>>> with torch.autograd.profiler.profile() as prof:
|
||||
>>> for _ in range(100): # any normal python code, really!
|
||||
>>> y = x ** 2
|
||||
>> y.backward()
|
||||
>>> y.backward()
|
||||
>>> # NOTE: some columns were removed for brevity
|
||||
>>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
|
||||
----------------------------------- --------------- --------------- ---------------
|
||||
@ -443,6 +444,7 @@ class record_function(ContextDecorator):
|
||||
... z = y ** 3
|
||||
... y.backward()
|
||||
...
|
||||
>>> # xdoctest: +IGNORE_WANT
|
||||
>>> # NOTE: some columns were removed for brevity
|
||||
>>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
|
||||
----------------------------------- --------------- --------------- ---------------
|
||||
@ -535,6 +537,7 @@ class emit_itt(object):
|
||||
Default: ``False``
|
||||
|
||||
Example:
|
||||
>>> # xdoctest: +SKIP("Undefined variables")
|
||||
>>> with torch.autograd.profiler.emit_itt():
|
||||
... model(x)
|
||||
|
||||
@ -602,6 +605,7 @@ class emit_nvtx(object):
|
||||
Default: ``False``
|
||||
|
||||
Example:
|
||||
>>> # xdoctest: +SKIP("undefined variables")
|
||||
>>> with torch.cuda.profiler.profile():
|
||||
... model(x) # Warmup CUDA memory allocator and profiler
|
||||
... with torch.autograd.profiler.emit_nvtx():
|
||||
|
@ -35,6 +35,7 @@ For example, you can the torch.linalg.inv function will raise torch.linalg.LinAl
|
||||
a matrix is not invertible.\n \
|
||||
\n\
|
||||
Example:\n \
|
||||
>>> # xdoctest: +REQUIRES(--lapac)\n \
|
||||
>>> matrix = torch.eye(3, 3)\n \
|
||||
>>> matrix[-1, -1] = 0\n \
|
||||
>>> matrix\n \
|
||||
|
@ -126,6 +126,7 @@ def load_state_dict(
|
||||
None.
|
||||
|
||||
Examples
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> my_model = MyModule()
|
||||
>>> optimizer = Adagrad(my_model.parameters())
|
||||
>>> model_state_dict = my_model.state_dict()
|
||||
|
@ -128,6 +128,7 @@ def save_state_dict(
|
||||
no_dist (bool): Don't attempt to save in SPMD style. Default to False
|
||||
|
||||
Example:
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> my_model = MyModule()
|
||||
>>> # We must call this function prior to state_dict()
|
||||
>>> my_model._register_state_dict_hook(state_dict_hook)
|
||||
|
@ -60,6 +60,7 @@ class _PartialTensor(torch.Tensor):
|
||||
Examples:
|
||||
>>> # All tensors below are of torch.int64 type.
|
||||
>>> # We have 2 process groups, 2 ranks.
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
|
||||
>>> tensor = torch.cat([tensor, tensor + 2])
|
||||
>>> tensor
|
||||
|
@ -30,6 +30,7 @@ def named_params_with_sharded_tensor(
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> model = torch.nn.Linear(*linear_size)
|
||||
>>> shard_parameter(model, "weight", spec)
|
||||
>>> for name, param in named_params_with_sharded_tensor(model):
|
||||
|
@ -363,22 +363,24 @@ def init_from_local_shards(
|
||||
|
||||
|
||||
Examples:
|
||||
Suppose we want construct a sharded tensor on two ranks, global size = (10, 5),
|
||||
each shard have a (5, 5) local tensor, we can do it like below:
|
||||
Suppose we want construct a sharded tensor on two ranks, global size = (10, 5),
|
||||
each shard have a (5, 5) local tensor, we can do it like below:
|
||||
|
||||
on rank 0:
|
||||
on rank 0:
|
||||
>>> # xdoctest: +SKIP("not distributed")
|
||||
>>> local_shard_metadata = ShardMetadata(
|
||||
>>> shard_offsets=[0, 0]
|
||||
>>> shard_lengths=[5, 5]
|
||||
>>> shard_offsets=[0, 0],
|
||||
>>> shard_lengths=[5, 5],
|
||||
>>> placement="rank:0/cuda:0"
|
||||
>>> )
|
||||
>>> local_shards = [Shard(torch.randn(5, 5), local_shard_metadata)]
|
||||
>>> sharded_tensor = init_from_local_shards(local_shards, [10, 5])
|
||||
|
||||
on rank 1:
|
||||
on rank 1:
|
||||
>>> # xdoctest: +SKIP("not distributed")
|
||||
>>> local_shard_metadata = ShardMetadata(
|
||||
>>> shard_offsets=[5, 0]
|
||||
>>> shard_lengths=[5, 5]
|
||||
>>> shard_offsets=[5, 0],
|
||||
>>> shard_lengths=[5, 5],
|
||||
>>> placement="rank:1/cuda:1"
|
||||
>>> )
|
||||
>>> local_shards = [Shard(torch.randn(5, 5), local_shard_metadata)]
|
||||
@ -427,8 +429,8 @@ def custom_sharded_op_impl(func):
|
||||
Example::
|
||||
>>> @custom_sharded_op_impl(torch.nn.functional.linear)
|
||||
>>> def my_custom_sharded_linear(types, args, kwargs, process_group):
|
||||
>>> ....
|
||||
>>>
|
||||
>>> ...
|
||||
>>> # xdoctest: +SKIP("Undefined variables")
|
||||
>>> input = torch.rand(10, 32)
|
||||
>>> weight = sharded_tensor.rand(32, 16)
|
||||
>>> bias = torch.rand(16)
|
||||
|
@ -12,11 +12,12 @@ def _sharded_op_common(op, early_stop_func, extra_check):
|
||||
different behaviors are done on either local shards or a local tensor.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP("Undefined variables")
|
||||
>>> op = torch.transpose
|
||||
>>> @_sharded_op_impl(op)
|
||||
>>> @_sharded_op_common(op, early_stop_func, extra_check)
|
||||
>>> def sharded_tensor_op(types, args, kwargs, process_group):
|
||||
>>> ....
|
||||
>>> ...
|
||||
>>>
|
||||
>>> st = sharded_tensor.rand(32, 16)
|
||||
>>> st.transpose(1, 2)
|
||||
|
@ -801,6 +801,7 @@ class ShardedTensor(ShardedTensorBase):
|
||||
Examples:
|
||||
>>> # All tensors below are of torch.int64 type.
|
||||
>>> # We have 2 process groups, 2 ranks.
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
|
||||
>>> local_tensor = torch.unsqueeze(torch.cat([tensor, tensor + 2]))
|
||||
>>> local_tensor
|
||||
@ -949,6 +950,7 @@ class ShardedTensor(ShardedTensorBase):
|
||||
|
||||
Examples:
|
||||
>>> # We have 2 process groups, 2 ranks.
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> tensor = torch.arange(4, dtype=torch.int64) + 1 + 2 * rank
|
||||
>>> tensor = torch.stack([tensor, tensor])
|
||||
>>> tensor
|
||||
|
@ -48,6 +48,7 @@ class ShardingPlan(object):
|
||||
>>> return self.relu(self.fc2(self.gelu(self.fc1(input))))
|
||||
|
||||
|
||||
>>> # xdoctest: +SKIP("Undefined spec1, spec2)
|
||||
>>> sharding_plan = ShardingPlan(
|
||||
>>> plan={
|
||||
>>> "fc1.weight": spec1,
|
||||
|
@ -97,6 +97,7 @@ def register_ddp_comm_hook(
|
||||
Uses Python comm hook implementations.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, model, state)
|
||||
"""
|
||||
comm_hook_type.value(model=model, state=state)
|
||||
|
@ -19,6 +19,7 @@ def noop_hook(_: Any, bucket: GradBucket) -> torch.futures.Future[torch.Tensor]:
|
||||
some factors such as the overlap between allreduce and computation or the desynchronization across ranks.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> ddp_model.register_comm_hook(None, noop_hook)
|
||||
"""
|
||||
fut: torch.futures.Future[torch.Tensor] = torch.futures.Future()
|
||||
|
@ -33,6 +33,7 @@ def allreduce_hook(
|
||||
unaffecting DDP behavior.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> ddp_model.register_comm_hook(process_group, allreduce_hook)
|
||||
"""
|
||||
return _allreduce_fut(process_group, bucket.buffer())
|
||||
@ -49,6 +50,7 @@ def fp16_compress_hook(
|
||||
tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``).
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> ddp_model.register_comm_hook(process_group, fp16_compress_hook)
|
||||
"""
|
||||
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
||||
@ -84,6 +86,7 @@ def bf16_compress_hook(
|
||||
tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``).
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> ddp_model.register_comm_hook(process_group, bf16_compress_hook)
|
||||
"""
|
||||
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
||||
@ -116,6 +119,7 @@ def fp16_compress_wrapper(
|
||||
Therefore, ``fp16_compress_hook`` is equivalent to ``fp16_compress_wrapper(allreduce_hook)``.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
|
||||
>>> ddp_model.register_comm_hook(state, fp16_compress_wrapper(powerSGD_hook))
|
||||
"""
|
||||
@ -153,6 +157,7 @@ def bf16_compress_wrapper(
|
||||
Therefore, ``bf16_compress_hook`` is equivalent to ``bf16_compress_wrapper(allreduce_hook)``.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
|
||||
>>> ddp_model.register_comm_hook(state, bf16_compress_wrapper(powerSGD_hook))
|
||||
"""
|
||||
|
@ -82,6 +82,7 @@ def post_localSGD_hook(
|
||||
Future handler of the communication, which updates the gradients in place.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> state = PostLocalSGDState(process_group=process_group, subgroup=subgroup,
|
||||
start_localSGD_iter=10)
|
||||
>>> ddp_model.register_comm_hook(state, post_localSGD_hook)
|
||||
|
@ -379,6 +379,7 @@ def powerSGD_hook(
|
||||
Future handler of the communication, which updates the gradients in place.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1,
|
||||
start_powerSGD_iter=10, min_compression_rate=0.5)
|
||||
>>> ddp_model.register_comm_hook(state, powerSGD_hook)
|
||||
@ -687,6 +688,7 @@ def batched_powerSGD_hook(
|
||||
Future handler of the communication, which updates the gradients in place.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
|
||||
>>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
|
||||
""" # noqa: B950
|
||||
|
@ -59,6 +59,7 @@ def quantization_pertensor_hook(
|
||||
``allreduce`` protocol. It works only with flattened grads.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> ddp_model.register_comm_hook(process_group, quantization_pertensor_hook)
|
||||
"""
|
||||
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
||||
@ -138,6 +139,7 @@ def quantization_perchannel_hook(
|
||||
``allreduce`` protocol. It works only with flattened grads.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> ddp_model.register_comm_hook(process_group, quantization_perchannel_hook)
|
||||
"""
|
||||
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
||||
|
@ -150,6 +150,7 @@ class Join():
|
||||
>>> import torch
|
||||
>>> import torch.distributed as dist
|
||||
>>> import torch.multiprocessing as mp
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> import torch.nn.parallel.DistributedDataParallel as DDP
|
||||
>>> import torch.distributed.optim.ZeroRedundancyOptimizer as ZeRO
|
||||
>>> from torch.distributed.algorithms.join import Join
|
||||
|
@ -49,35 +49,36 @@ class PeriodicModelAverager(ModelAverager):
|
||||
|
||||
Example::
|
||||
|
||||
>>> import torch
|
||||
>>> import torch.distributed as dist
|
||||
>>> import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD
|
||||
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
|
||||
>>> import torch.nn as nn
|
||||
>>> # xdoctest: +SKIP("undefined variables")
|
||||
>>> import torch
|
||||
>>> import torch.distributed as dist
|
||||
>>> import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD
|
||||
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
|
||||
>>> import torch.nn as nn
|
||||
>>>
|
||||
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
|
||||
>>> torch.cuda.set_device(rank)
|
||||
>>> module = nn.Linear(1, 1, bias=False).cuda()
|
||||
>>> model = nn.parallel.DistributedDataParallel(
|
||||
>>> module, device_ids=[rank], output_device=rank
|
||||
>>> )
|
||||
>>> # Register a post-localSGD communication hook.
|
||||
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
|
||||
>>> model.register_comm_hook(state, post_localSGD_hook)
|
||||
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
|
||||
>>> torch.cuda.set_device(rank)
|
||||
>>> module = nn.Linear(1, 1, bias=False).cuda()
|
||||
>>> model = nn.parallel.DistributedDataParallel(
|
||||
>>> module, device_ids=[rank], output_device=rank
|
||||
>>> )
|
||||
>>> # Register a post-localSGD communication hook.
|
||||
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
|
||||
>>> model.register_comm_hook(state, post_localSGD_hook)
|
||||
>>>
|
||||
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
|
||||
>>> # After 100 steps, run model averaging every 4 steps.
|
||||
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
|
||||
>>> averager = averagers.PeriodicModelAverager(period=4, warmup_steps=100)
|
||||
>>> for step in range(0, 200):
|
||||
>>> optimizer.zero_grad()
|
||||
>>> loss = loss_fn(output, labels)
|
||||
>>> loss.backward()
|
||||
>>> optimizer.step()
|
||||
>>> # Will average model parameters globally every 4 steps. Thus,
|
||||
>>> # inter-node communication only occurs every 4 iterations after
|
||||
>>> # the initial ``warmup_steps`` period.
|
||||
>>> averager.average_parameters(model.parameters())
|
||||
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
|
||||
>>> # After 100 steps, run model averaging every 4 steps.
|
||||
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
|
||||
>>> averager = averagers.PeriodicModelAverager(period=4, warmup_steps=100)
|
||||
>>> for step in range(0, 200):
|
||||
>>> optimizer.zero_grad()
|
||||
>>> loss = loss_fn(output, labels)
|
||||
>>> loss.backward()
|
||||
>>> optimizer.step()
|
||||
>>> # Will average model parameters globally every 4 steps. Thus,
|
||||
>>> # inter-node communication only occurs every 4 iterations after
|
||||
>>> # the initial ``warmup_steps`` period.
|
||||
>>> averager.average_parameters(model.parameters())
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -47,43 +47,44 @@ class HierarchicalModelAverager(averagers.ModelAverager):
|
||||
(default: ``None``)
|
||||
|
||||
Example::
|
||||
>>> from collections import OrderedDict
|
||||
>>> import torch
|
||||
>>> import torch.distributed as dist
|
||||
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
|
||||
>>> PostLocalSGDState,
|
||||
>>> post_localSGD_hook,
|
||||
>>> )
|
||||
>>> import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD
|
||||
>>> import torch.nn as nn
|
||||
>>> # xdoctest: +SKIP('undefined rank')
|
||||
>>> from collections import OrderedDict
|
||||
>>> import torch
|
||||
>>> import torch.distributed as dist
|
||||
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
|
||||
>>> PostLocalSGDState,
|
||||
>>> post_localSGD_hook,
|
||||
>>> )
|
||||
>>> import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD
|
||||
>>> import torch.nn as nn
|
||||
>>>
|
||||
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
|
||||
>>> torch.cuda.set_device(rank)
|
||||
>>> module = nn.Linear(1, 1, bias=False).to(rank)
|
||||
>>> model = nn.parallel.DistributedDataParallel(
|
||||
>>> module, device_ids=[rank], output_device=rank
|
||||
>>> )
|
||||
>>> # Register a post-localSGD communication hook.
|
||||
>>> # Assume that each machine has 4 GPUs, then each intra-machine subgroup has a size of 4.
|
||||
>>> subgroup, _ = dist.new_subgroups()
|
||||
>>> state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=100)
|
||||
>>> model.register_comm_hook(state, post_localSGD_hook)
|
||||
>>> dist.init_process_group("nccl", rank=rank, world_size=16)
|
||||
>>> torch.cuda.set_device(rank)
|
||||
>>> module = nn.Linear(1, 1, bias=False).to(rank)
|
||||
>>> model = nn.parallel.DistributedDataParallel(
|
||||
>>> module, device_ids=[rank], output_device=rank
|
||||
>>> )
|
||||
>>> # Register a post-localSGD communication hook.
|
||||
>>> # Assume that each machine has 4 GPUs, then each intra-machine subgroup has a size of 4.
|
||||
>>> subgroup, _ = dist.new_subgroups()
|
||||
>>> state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=100)
|
||||
>>> model.register_comm_hook(state, post_localSGD_hook)
|
||||
>>>
|
||||
>>> # Average parameters among each group of 8 processes every 4 iterations, and among all
|
||||
>>> # the 16 processes every 16 iterations.
|
||||
>>> averager = hierarchicalSGD.HierarchicalModelAverager(
|
||||
>>> period_group_size_dict=OrderedDict([(4, 8), (16, 16)]), warmup_steps=100)
|
||||
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
|
||||
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
|
||||
>>> # After 100 steps, run model averaging at two levels.
|
||||
>>> for step in range(0, 200):
|
||||
>>> optimizer.zero_grad()
|
||||
>>> loss = loss_fn(output, labels)
|
||||
>>> loss.backward()
|
||||
>>> optimizer.step()
|
||||
>>> # Average parameters after ``optimizer.step()``.
|
||||
>>> # Thus, the inter-node communication only occurs periodically after ``warmup_steps``.
|
||||
>>> averager.average_parameters(model.parameters())
|
||||
>>> # Average parameters among each group of 8 processes every 4 iterations, and among all
|
||||
>>> # the 16 processes every 16 iterations.
|
||||
>>> averager = hierarchicalSGD.HierarchicalModelAverager(
|
||||
>>> period_group_size_dict=OrderedDict([(4, 8), (16, 16)]), warmup_steps=100)
|
||||
>>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
|
||||
>>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
|
||||
>>> # After 100 steps, run model averaging at two levels.
|
||||
>>> for step in range(0, 200):
|
||||
>>> optimizer.zero_grad()
|
||||
>>> loss = loss_fn(output, labels)
|
||||
>>> loss.backward()
|
||||
>>> optimizer.step()
|
||||
>>> # Average parameters after ``optimizer.step()``.
|
||||
>>> # Thus, the inter-node communication only occurs periodically after ``warmup_steps``.
|
||||
>>> averager.average_parameters(model.parameters())
|
||||
|
||||
.. warning ::
|
||||
The last group size in the dict must be the size of the provided ``process_group``,
|
||||
|
@ -36,6 +36,7 @@ class context(object):
|
||||
|
||||
Example::
|
||||
>>> import torch.distributed.autograd as dist_autograd
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> with dist_autograd.context() as context_id:
|
||||
>>> t1 = torch.rand((3, 3), requires_grad=True)
|
||||
>>> t2 = torch.rand((3, 3), requires_grad=True)
|
||||
|
@ -1125,6 +1125,7 @@ def batch_isend_irecv(p2p_op_list):
|
||||
op in the op_list.
|
||||
|
||||
Examples:
|
||||
>>> # xdoctest: +SKIP("no rank")
|
||||
>>> send_tensor = torch.arange(2) + 2 * rank
|
||||
>>> recv_tensor = torch.randn(2)
|
||||
>>> send_op = dist.P2POp(dist.isend, send_tensor, (rank + 1)%world_size)
|
||||
@ -1338,6 +1339,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
|
||||
None, if not async_op or if not part of the group
|
||||
|
||||
Examples:
|
||||
>>> # xdoctest: +SKIP("no rank")
|
||||
>>> # All tensors below are of torch.int64 type.
|
||||
>>> # We have 2 process groups, 2 ranks.
|
||||
>>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
|
||||
@ -1680,6 +1682,7 @@ def all_gather_object(object_list, obj, group=None):
|
||||
function with data you trust.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> # Note: Process group initialization omitted on each rank.
|
||||
>>> import torch.distributed as dist
|
||||
>>> # Assumes world_size of 3.
|
||||
@ -1766,16 +1769,17 @@ def gather_object(obj, object_gather_list=None, dst=0, group=None):
|
||||
function with data you trust.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> # Note: Process group initialization omitted on each rank.
|
||||
>>> import torch.distributed as dist
|
||||
>>> # Assumes world_size of 3.
|
||||
>>> gather_objects = ["foo", 12, {1: 2}] # any picklable object
|
||||
>>> output = [None for _ in gather_objects]
|
||||
>>> dist.gather_object(
|
||||
gather_objects[dist.get_rank()],
|
||||
output if dist.get_rank() == 0 else None,
|
||||
dst=0
|
||||
)
|
||||
... gather_objects[dist.get_rank()],
|
||||
... output if dist.get_rank() == 0 else None,
|
||||
... dst=0
|
||||
... )
|
||||
>>> # On rank 0
|
||||
>>> output
|
||||
['foo', 12, {1: 2}]
|
||||
@ -1871,6 +1875,7 @@ def broadcast_object_list(object_list, src=0, group=None, device=None):
|
||||
function with data you trust.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> # Note: Process group initialization omitted on each rank.
|
||||
>>> import torch.distributed as dist
|
||||
>>> if dist.get_rank() == 0:
|
||||
@ -1968,6 +1973,7 @@ def scatter_object_list(
|
||||
function with data you trust.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> # Note: Process group initialization omitted on each rank.
|
||||
>>> import torch.distributed as dist
|
||||
>>> if dist.get_rank() == 0:
|
||||
@ -2053,6 +2059,7 @@ def all_gather(tensor_list, tensor, group=None, async_op=False):
|
||||
None, if not async_op or if not part of the group
|
||||
|
||||
Examples:
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> # All tensors below are of torch.int64 dtype.
|
||||
>>> # We have 2 process groups, 2 ranks.
|
||||
>>> tensor_list = [torch.zeros(2, dtype=torch.int64) for _ in range(2)]
|
||||
@ -2122,6 +2129,7 @@ def _all_gather_base(output_tensor, input_tensor, group=None, async_op=False):
|
||||
None, if not async_op or if not part of the group
|
||||
|
||||
Examples:
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> # All tensors below are of torch.int64 dtype.
|
||||
>>> # We have 2 process groups, 2 ranks.
|
||||
>>> output_tensor = torch.zeros(2, dtype=torch.int64)
|
||||
@ -2563,6 +2571,7 @@ def all_to_all_single(
|
||||
`all_to_all_single` is experimental and subject to change.
|
||||
|
||||
Examples:
|
||||
>>> # xdoctest: +SKIP("Undefined rank")
|
||||
>>> input = torch.arange(4) + rank * 4
|
||||
>>> input
|
||||
tensor([0, 1, 2, 3]) # Rank 0
|
||||
@ -2678,6 +2687,7 @@ def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False
|
||||
`all_to_all` is experimental and subject to change.
|
||||
|
||||
Examples:
|
||||
>>> # xdoctest: +SKIP("Undefined rank")
|
||||
>>> input = torch.arange(4) + rank * 4
|
||||
>>> input = list(input.chunk(4))
|
||||
>>> input
|
||||
@ -2858,6 +2868,7 @@ def monitored_barrier(group=GroupMember.WORLD, timeout=None, wait_all_ranks=Fals
|
||||
``None``.
|
||||
|
||||
Example::
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> # Note: Process group initialization omitted on each rank.
|
||||
>>> import torch.distributed as dist
|
||||
>>> if dist.get_rank() != 1:
|
||||
@ -3115,6 +3126,7 @@ def new_subgroups(
|
||||
|
||||
Examples:
|
||||
>>> # Create intra-machine subgroups.
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> cur_subgroup, subgroups = dist.new_subgroups()
|
||||
>>> # Allreduce within the machine.
|
||||
>>> rank = dist.get_rank()
|
||||
@ -3229,6 +3241,7 @@ def new_subgroups_by_enumeration(
|
||||
|
||||
Examples:
|
||||
>>> # Create two subgroups, where each has 2 processes.
|
||||
>>> # xdoctest: +SKIP("need process group init")
|
||||
>>> cur_subgroup, subgroups = dist.new_subgroups(ranks=[[0, 2], [1, 3]])
|
||||
>>> rank = dist.get_rank()
|
||||
>>> tensor = torch.ones(1, device=rank) * rank

@ -293,8 +293,9 @@ class StateDictType(Enum):
meaningful to FSDP (because parameters are flattened). Note that
these APIs are meant for use via the :func:`state_dict_type`
context manager as follows:
>>> # xdoctest: +SKIP("undefined variables")
>>> with fsdp.state_dict_type(StateDictType.LOCAL_STATE_DICT):
>>> state = fsdp.state_dict() # loads local state dict
... state = fsdp.state_dict() # loads local state dict
3. ``_sharded_state_dict/_load_sharded_state_dict``: this pair of APIs
return and load sharded, unflattened parameters. The ``state_dict``
return by ``sharded_state_dict`` can be used by all other parallel
@ -326,6 +327,7 @@ class FullStateDictConfig(StateDictConfig):
together to optimize memory savings when taking checkpoints. Note that
this config class is meant for user via the :func:`state_dict_type`
context manager as follows:
>>> # xdoctest: +SKIP("undefined variables")
>>> fsdp = FSDP(model, auto_wrap_policy=...)
>>> cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
>>> with FullyShardedDataParallel.state_dict_type(fsdp, StateDictType.FULL_STATE_DICT, cfg):
@ -470,6 +472,7 @@ class FullyShardedDataParallel(nn.Module):

Example::

>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> torch.cuda.set_device(device_id)
@ -623,9 +626,11 @@ class FullyShardedDataParallel(nn.Module):

Example::

>>> # xdoctest: +SKIP("undefined variables")
>>> module = MyModule(device="meta")
>>> def my_init_fn(module):
>>> # responsible for initializing a module, such as with reset_parameters
>>> ...
>>> fsdp_model = FSDP(module, param_init_fn=my_init_fn, auto_wrap_policy=size_based_auto_wrap_policy)
>>> print(next(fsdp_model.parameters()).device) # current CUDA device
>>> # With torchdistX
@ -1806,9 +1811,10 @@ class FullyShardedDataParallel(nn.Module):

Example::

>>> model = DDP(FSDP(...))
>>> with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
>>> checkpoint = model.state_dict()
>>> # xdoctest: +SKIP("undefined variables")
>>> model = DDP(FSDP(...))
>>> with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
>>> checkpoint = model.state_dict()

Args:
module (torch.nn.Module): Root module.
@ -2051,22 +2057,23 @@ class FullyShardedDataParallel(nn.Module):

Example::

>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT, full_state_dict_config):
>>> full_dict = sharded_module.state_dict()
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> local_dict = sharded_module.state_dict()
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT, full_state_dict_config):
>>> full_dict = sharded_module.state_dict()
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> local_dict = sharded_module.state_dict()
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])

.. warning:: This needs to be called on all ranks, since synchronization
primitives may be used.
@ -2332,24 +2339,25 @@ class FullyShardedDataParallel(nn.Module):

Example::

>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> checkpoint = torch.load(PATH)
>>> full_state_dict = checkpoint['full_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT):
>>> sharded_module.load_state_dict(full_state_dict)
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> local_state_dict = checkpoint['local_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> sharded_module.load_state_dict(local_state_dict)
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> from torch.distributed.fsdp import StateDictType
>>> torch.cuda.set_device(device_id)
>>> my_module = nn.Linear(...)
>>> sharded_module = FSDP(my_module)
>>> checkpoint = torch.load(PATH)
>>> full_state_dict = checkpoint['full_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT):
>>> sharded_module.load_state_dict(full_state_dict)
>>> full_dict.keys()
>>> odict_keys(['weight', 'bias'])
>>> # using local state dict
>>> local_state_dict = checkpoint['local_state_dict']
>>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
>>> sharded_module.load_state_dict(local_state_dict)
>>> local_dict.keys()
>>> odict_keys(['flat_param', 'inner.flat_param'])

.. warning:: This needs to be called on all ranks, since synchronization
primitives may be used.
@ -3841,6 +3849,7 @@ class FullyShardedDataParallel(nn.Module):

Example::

>>> # xdoctest: +SKIP("undefined variables")
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> model, optim = ...
>>> full_osd = FSDP.full_optim_state_dict(model, optim)
@ -3908,6 +3917,7 @@ class FullyShardedDataParallel(nn.Module):

Example::

>>> # xdoctest: +SKIP("undefined variables")
>>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
>>> model, optim = ...
>>> full_osd = FSDP.full_optim_state_dict(model, optim) # only non-empty on rank 0
@ -4004,6 +4014,7 @@ class FullyShardedDataParallel(nn.Module):
:meth:`full_optim_state_dict`) to use parameter IDs and be loadable to
a non-wrapped model::

>>> # xdoctest: +SKIP("undefined variables")
>>> wrapped_model, wrapped_optim = ...
>>> full_osd = FSDP.full_optim_state_dict(wrapped_model, wrapped_optim)
>>> nonwrapped_model, nonwrapped_optim = ...
@ -4013,6 +4024,7 @@ class FullyShardedDataParallel(nn.Module):
To re-key a normal optimizer state dict from a non-wrapped model to be
loadable to a wrapped model::

>>> # xdoctest: +SKIP("undefined variables")
>>> nonwrapped_model, nonwrapped_optim = ...
>>> osd = nonwrapped_optim.state_dict()
>>> rekeyed_osd = FSDP.rekey_optim_state_dict(osd, OptimStateKeyType.PARAM_NAME, nonwrapped_model)
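For readers following the directives inserted throughout this patch, here is a minimal, hypothetical sketch (not taken from the diff) of how an inline ``# xdoctest: +SKIP`` comment marks one example as skipped while leaving a second example collectable; the module and function names below are invented for illustration.

    # demo.py -- hypothetical module, for illustration only
    def scaled_ones():
        """
        The first example is skipped because it references variables that a
        single-process doctest run cannot define:

        >>> # xdoctest: +SKIP("undefined variables")
        >>> torch.ones(1, device=rank) * rank

        The second example carries no directive, so it is collected and checked:

        >>> 1 + 1
        2
        """

Such a module can then be checked with the xdoctest runner (for example ``python -m xdoctest demo.py``) or through its pytest plugin.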
@ -30,7 +30,7 @@ GPU (nproc_per_node - 1)*.

::

>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
arguments of your training script)

@ -41,7 +41,7 @@ Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*

::

>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
--master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
and all other arguments of your training script)
@ -50,7 +50,7 @@ Node 2:

::

>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
--nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
--master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
and all other arguments of your training script)
@ -59,7 +59,7 @@ Node 2:

::

>>> python -m torch.distributed.launch --help
python -m torch.distributed.launch --help


**Important Notices:**
@ -78,6 +78,7 @@ Parsing the local_rank argument

::

>>> # xdoctest: +SKIP
>>> import argparse
>>> parser = argparse.ArgumentParser()
>>> parser.add_argument("--local_rank", type=int)
@ -95,6 +96,7 @@ or

>>> with torch.cuda.device(args.local_rank):
>>> # your code to run
>>> ...

3. In your training program, you are supposed to call the following function
at the beginning to start the distributed backend. It is strongly recommended
@ -103,8 +105,8 @@ but ``env://`` is the one that is officially supported by this module.

::

torch.distributed.init_process_group(backend='YOUR BACKEND',
init_method='env://')
>>> torch.distributed.init_process_group(backend='YOUR BACKEND',
>>> init_method='env://')

4. In your training program, you can either use regular distributed functions
or use :func:`torch.nn.parallel.DistributedDataParallel` module. If your
@ -114,9 +116,9 @@ here is how to configure it.

::

model = torch.nn.parallel.DistributedDataParallel(model,
device_ids=[args.local_rank],
output_device=args.local_rank)
>>> model = torch.nn.parallel.DistributedDataParallel(model,
>>> device_ids=[args.local_rank],
>>> output_device=args.local_rank)

Please ensure that ``device_ids`` argument is set to be the only GPU device id
that your code will be operating on. This is generally the local rank of the
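As a rough sketch of the workflow these notes describe, the following hypothetical ``train.py`` parses ``--local_rank``, initializes the backend over ``env://``, and wraps the model in DistributedDataParallel; the gloo backend and the linear model are placeholders, not part of the patch.

    # train.py -- hypothetical script assembled from the steps above
    import argparse
    import torch
    import torch.distributed as dist

    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    # the launcher supplies --local_rank and the env:// rendezvous variables
    dist.init_process_group(backend="gloo", init_method="env://")

    model = torch.nn.Linear(10, 10)
    if torch.cuda.is_available():
        torch.cuda.set_device(args.local_rank)
        model = model.cuda(args.local_rank)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    else:
        model = torch.nn.parallel.DistributedDataParallel(model)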
@ -199,6 +199,7 @@ class _RemoteModule(nn.Module):
>>> from torch import nn, Tensor
>>> from torch.distributed.nn.api.remote_module import RemoteModule
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> remote_linear_module = RemoteModule(
>>> "worker1/cpu", nn.Linear, args=(20, 30),
@ -505,6 +506,7 @@ class _RemoteModule(nn.Module):
>>> from torch import nn, Tensor
>>> from torch.distributed.nn.api.remote_module import RemoteModule
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> remote_module = RemoteModule(
>>> "worker1/cpu", nn.Linear, args=(20, 30),
@ -626,6 +628,7 @@ class RemoteModule(_RemoteModule):
>>> from torch import nn, Tensor
>>> from torch.distributed.nn.api.remote_module import RemoteModule
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> remote_linear_module = RemoteModule(
>>> "worker1/cpu", nn.Linear, args=(20, 30),

@ -134,6 +134,7 @@ def _all_gather_base(output_tensor, input_tensor, group=group.WORLD):
Examples:
>>> # All tensors below are of torch.int64 dtype.
>>> # We have 2 process groups, 2 ranks.
>>> # xdoctest: +SKIP("incorrect want text")
>>> output_tensor = torch.zeros(2, dtype=torch.int64)
>>> output_tensor
[tensor([0, 0])] # Rank 0 and 1

@ -168,6 +168,7 @@ class DistributedOptimizer:
>>> from torch import optim
>>> from torch.distributed.optim import DistributedOptimizer
>>>
>>> # xdoctest: +SKIP
>>> with dist_autograd.context() as context_id:
>>> # Forward pass.
>>> rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
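For a runnable counterpart to the two-rank collective shown above, a single-process sketch with the public ``all_gather`` (gloo backend, arbitrary placeholder port) looks like this; with ``world_size=1`` the gathered list simply contains the local tensor.

    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="gloo", init_method="tcp://localhost:29501",
                            rank=0, world_size=1)
    t = torch.arange(2)
    out = [torch.zeros(2, dtype=torch.int64)]
    dist.all_gather(out, t)   # out == [tensor([0, 1])]
    dist.destroy_process_group()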
@ -15,41 +15,42 @@ class PostLocalSGDOptimizer(torch.optim.Optimizer):

Example::

>>> import torch
>>> import torch.distributed as dist
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
>>> import torch.nn as nn
>>> from torch.distributed.optim import PostLocalSGDOptimizer
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
>>> PostLocalSGDState,
>>> post_localSGD_hook,
>>> )
>>> # xdoctest: +SKIP("undefined variables")
>>> import torch
>>> import torch.distributed as dist
>>> import torch.distributed.algorithms.model_averaging.averagers as averagers
>>> import torch.nn as nn
>>> from torch.distributed.optim import PostLocalSGDOptimizer
>>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
>>> PostLocalSGDState,
>>> post_localSGD_hook,
>>> )
>>>
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>> model = nn.parallel.DistributedDataParallel(
>>> module, device_ids=[rank], output_device=rank
>>> )
>>>
>>> # Register a post-localSGD communication hook.
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>> # Register a post-localSGD communication hook.
>>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
>>> model.register_comm_hook(state, post_localSGD_hook)
>>>
>>> # Create a post-localSGD optimizer that wraps a local optimizer.
>>> # Note that ``warmup_steps`` used in ``PostLocalSGDOptimizer`` must be the same as
>>> # ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
>>> opt = PostLocalSGDOptimizer(
>>> optim=local_optim,
>>> averager=averagers.PeriodicModelAverager(period=4, warmup_steps=100)
>>> )
>>> # Create a post-localSGD optimizer that wraps a local optimizer.
>>> # Note that ``warmup_steps`` used in ``PostLocalSGDOptimizer`` must be the same as
>>> # ``start_localSGD_iter`` used in ``PostLocalSGDState``.
>>> local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
>>> opt = PostLocalSGDOptimizer(
>>> optim=local_optim,
>>> averager=averagers.PeriodicModelAverager(period=4, warmup_steps=100)
>>> )
>>>
>>> # In the first 100 steps, DDP runs global gradient averaging at every step.
>>> # After 100 steps, DDP runs gradient averaging within each subgroup (intra-node by default),
>>> # and post-localSGD optimizer runs global model averaging every 4 steps after applying the local optimizer.
>>> for step in range(0, 200):
>>> opt.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> opt.step()
>>> # In the first 100 steps, DDP runs global gradient averaging at every step.
>>> # After 100 steps, DDP runs gradient averaging within each subgroup (intra-node by default),
>>> # and post-localSGD optimizer runs global model averaging every 4 steps after applying the local optimizer.
>>> for step in range(0, 200):
>>> opt.zero_grad()
>>> loss = loss_fn(output, labels)
>>> loss.backward()
>>> opt.step()
"""

def __init__(
@ -32,6 +32,7 @@ def register_functional_optim(key, optim):
need not be of :class:`torch.optim.Optimizer` (e.g. for custom optimizers)
Example::
>>> # import the new functional optimizer
>>> # xdoctest: +SKIP
>>> from xyz import fn_optimizer
>>> from torch.distributed.optim.utils import register_functional_optim
>>> fn_optim_key = "XYZ_optim"

@ -331,6 +331,7 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable):
>>> from torch.distributed.optim import ZeroRedundancyOptimizer
>>> from torch.nn.parallel import DistributedDataParallel as DDP

>>> # xdoctest: +SKIP
>>> model = nn.Sequential(*[nn.Linear(2000, 2000).to(rank) for _ in range(20)])
>>> ddp = DDP(model, device_ids=[rank])
>>> opt = ZeroRedundancyOptimizer(
@ -156,6 +156,7 @@ class WithDevice(nn.Module):
>>> # Dropout does not have any parameters/buffers, but we want to
>>> # run it on cuda:1 to avoid any GPU to CPU transfers.
>>> model = nn.Sequential(fc1, fc2, WithDevice(dropout, 'cuda:1'))
>>> # xdoctest: +SKIP
>>> model = Pipe(model, chunks=8)
"""
def __init__(self, module: nn.Module, device: torch.device):
@ -270,6 +271,7 @@ class Pipe(Module):
Pipeline of two FC layers across GPUs 0 and 1.

>>> # Need to initialize RPC framework first.
>>> # xdoctest: +SKIP
>>> os.environ['MASTER_ADDR'] = 'localhost'
>>> os.environ['MASTER_PORT'] = '29500'
>>> torch.distributed.rpc.init_rpc('worker', rank=0, world_size=1)
@ -160,6 +160,7 @@ def _wait_all():
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
>>> # xdoctest: +SKIP
>>> rpc.init_rpc("worker0", rank=0, world_size=2)
>>> with rpc._wait_all():
>>> fut_1 = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1))
@ -331,11 +332,12 @@ def shutdown(graceful=True, timeout=DEFAULT_SHUTDOWN_TIMEOUT):
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,

>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678

Then run the following code in two different processes:

>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -574,15 +576,17 @@ def remote(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
raised as they have not yet been handled.

Example::

Make sure that ``MASTER_ADDR`` and ``MASTER_PORT`` are set properly
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,

>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678

Then run the following code in two different processes:

>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -759,11 +763,12 @@ def rpc_sync(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,

>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678

Then run the following code in two different processes:

>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -850,11 +855,12 @@ def rpc_async(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
on both workers. Refer to :meth:`~torch.distributed.init_process_group`
API for more details. For example,

>>> export MASTER_ADDR=localhost
>>> export MASTER_PORT=5678
export MASTER_ADDR=localhost
export MASTER_PORT=5678

Then run the following code in two different processes:

>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
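The docstring examples above assume two workers and are therefore skipped; a self-contained sketch that does run in a single process is a world of size 1 in which the worker issues an RPC to itself (the address and port below are arbitrary placeholders).

    import os
    import torch
    import torch.distributed.rpc as rpc

    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")

    rpc.init_rpc("worker0", rank=0, world_size=1)
    # rpc_sync blocks until torch.add returns on the (same) destination worker
    result = rpc.rpc_sync("worker0", torch.add, args=(torch.ones(2), 1))
    print(result)  # tensor([2., 2.])
    rpc.shutdown()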
@ -56,6 +56,7 @@ def async_execution(fn):
>>> )
>>>
>>> # On worker0
>>> # xdoctest: +SKIP
>>> ret = rpc.rpc_sync(
>>> "worker1",
>>> async_add_chained,

@ -127,6 +127,7 @@ class TensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
>>> options.set_device_map("worker1", {1: 2})
>>> # maps worker0's cuda:1 to worker1's cuda:2
>>>
>>> # xdoctest: +SKIP
>>> rpc.init_rpc(
>>> "worker0",
>>> rank=0,

@ -55,6 +55,7 @@ class _server_process_global_profile(profile):
please use ``use_cuda = False`` or ``num_workers = 0``.

Example:
>>> # xdoctest: +SKIP
>>> # On worker 0:
>>> import torch
>>> import torch.distributed.rpc as rpc
@ -67,7 +68,7 @@ class _server_process_global_profile(profile):
>>> inner_profile_rref.rpc_sync().__enter__()
>>> rpc.rpc_sync(dst_worker_name, torch.sub, (x, y))
>>> inner_profile_rref.rpc_sync().__exit__(None, None, None)
>>> outer_profile_rref.rpc_sync().__exit__(None, None, None
>>> outer_profile_rref.rpc_sync().__exit__(None, None, None)
>>> print(inner_profile_rref.rpc_sync().key_averages())
--------- --------------- --------------- --------------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls
@ -82,7 +82,7 @@ Single-node multi-worker

::

>>> torchrun
torchrun
--standalone
--nnodes=1
--nproc_per_node=$NUM_TRAINERS
@ -101,7 +101,7 @@ port automatically instead of manually assgining different ports for each run.

::

>>> torchrun
torchrun
--rdzv_backend=c10d
--rdzv_endpoint=localhost:0
--nnodes=1
@ -114,7 +114,7 @@ Fault tolerant (fixed sized number of workers, no elasticity, tolerates 3 failur

::

>>> torchrun
torchrun
--nnodes=$NUM_NODES
--nproc_per_node=$NUM_TRAINERS
--max_restarts=3
@ -135,7 +135,7 @@ Elastic (``min=1``, ``max=4``, tolerates up to 3 membership changes or failures)

::

>>> torchrun
torchrun
--nnodes=1:4
--nproc_per_node=$NUM_TRAINERS
--max_restarts=3
@ -294,6 +294,7 @@ Important Notices

::

>>> # xdoctest: +SKIP("stub")
>>> import torch.distributed as dist
>>> dist.init_process_group(backend="gloo|nccl")

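A minimal worker script that the torchrun invocations above could launch might look like the following sketch; it relies only on the environment variables the launcher exports (RANK, LOCAL_RANK, WORLD_SIZE) and uses gloo as a placeholder backend.

    # elastic_demo.py -- hypothetical worker script
    import os
    import torch.distributed as dist

    dist.init_process_group(backend="gloo")  # env:// rendezvous is the default
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    print(f"rank {rank}/{world_size}, local rank {local_rank}")
    dist.destroy_process_group()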
@ -19,6 +19,7 @@ class Bernoulli(ExponentialFamily):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Bernoulli(torch.tensor([0.3]))
>>> m.sample() # 30% chance 1; 70% chance 0
tensor([ 0.])

@ -14,6 +14,7 @@ class Beta(ExponentialFamily):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Beta(torch.tensor([0.5]), torch.tensor([0.5]))
>>> m.sample() # Beta distributed with concentration concentration1 and concentration0
tensor([ 0.1046])

@ -18,6 +18,7 @@ class Binomial(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Binomial(100, torch.tensor([0 , .2, .8, 1]))
>>> x = m.sample()
tensor([ 0., 22., 71., 100.])

@ -35,6 +35,7 @@ class Categorical(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Categorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m.sample() # equal probability of 0, 1, 2, 3
tensor(3)

@ -17,6 +17,7 @@ class Cauchy(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Cauchy(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Cauchy distribution with loc=0 and scale=1
tensor([ 2.3214])

@ -10,6 +10,7 @@ class Chi2(Gamma):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Chi2(torch.tensor([1.0]))
>>> m.sample() # Chi2 distributed with shape df=1
tensor([ 0.1046])

@ -22,6 +22,7 @@ class ContinuousBernoulli(ExponentialFamily):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = ContinuousBernoulli(torch.tensor([0.3]))
>>> m.sample()
tensor([ 0.2538])

@ -33,6 +33,7 @@ class Dirichlet(ExponentialFamily):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Dirichlet(torch.tensor([0.5, 0.5]))
>>> m.sample() # Dirichlet distributed with concentration [0.5, 0.5]
tensor([ 0.1046, 0.8954])

@ -13,6 +13,7 @@ class Exponential(ExponentialFamily):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Exponential(torch.tensor([1.0]))
>>> m.sample() # Exponential distributed with rate=1
tensor([ 0.1046])

@ -14,6 +14,7 @@ class FisherSnedecor(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = FisherSnedecor(torch.tensor([1.0]), torch.tensor([2.0]))
>>> m.sample() # Fisher-Snedecor-distributed with df1=1 and df2=2
tensor([ 0.2453])

@ -17,6 +17,7 @@ class Gamma(ExponentialFamily):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Gamma(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # Gamma distributed with concentration=1 and rate=1
tensor([ 0.1046])

@ -19,6 +19,7 @@ class Geometric(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Geometric(torch.tensor([0.3]))
>>> m.sample() # underlying Bernoulli has 30% chance 1; 70% chance 0
tensor([ 2.])

@ -15,6 +15,7 @@ class Gumbel(TransformedDistribution):

Examples::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Gumbel(torch.tensor([1.0]), torch.tensor([2.0]))
>>> m.sample() # sample from Gumbel distribution with loc=1, scale=2
tensor([ 1.0124])

@ -18,6 +18,7 @@ class HalfCauchy(TransformedDistribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = HalfCauchy(torch.tensor([1.0]))
>>> m.sample() # half-cauchy distributed with scale=1
tensor([ 2.3214])

@ -18,6 +18,7 @@ class HalfNormal(TransformedDistribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = HalfNormal(torch.tensor([1.0]))
>>> m.sample() # half-normal distributed with scale=1
tensor([ 0.1046])
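The ``IGNORE_WANT`` directives above exist because ``sample()`` output is random; as an aside, a doctest of this kind can stay checkable by fixing the seed and asserting a deterministic property such as the sample's shape. The snippet below is illustrative and is not part of the patch.

    >>> import torch
    >>> from torch.distributions import Bernoulli
    >>> _ = torch.manual_seed(0)
    >>> m = Bernoulli(torch.tensor([0.3]))
    >>> m.sample().shape
    torch.Size([1])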
@ -15,17 +15,19 @@ class Independent(Distribution):
the same shape as a Multivariate Normal distribution (so they are
interchangeable), you can::

>>> from torch.distributions.multivariate_normal import MultivariateNormal
>>> from torch.distributions.normal import Normal
>>> loc = torch.zeros(3)
>>> scale = torch.ones(3)
>>> mvn = MultivariateNormal(loc, scale_tril=torch.diag(scale))
>>> [mvn.batch_shape, mvn.event_shape]
[torch.Size(()), torch.Size((3,))]
[torch.Size([]), torch.Size([3])]
>>> normal = Normal(loc, scale)
>>> [normal.batch_shape, normal.event_shape]
[torch.Size((3,)), torch.Size(())]
[torch.Size([3]), torch.Size([])]
>>> diagn = Independent(normal, 1)
>>> [diagn.batch_shape, diagn.event_shape]
[torch.Size(()), torch.Size((3,))]
[torch.Size([]), torch.Size([3])]

Args:
base_distribution (torch.distributions.distribution.Distribution): a

@ -23,6 +23,7 @@ class Kumaraswamy(TransformedDistribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Kumaraswamy(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Kumaraswamy distribution with concentration alpha=1 and beta=1
tensor([ 0.1729])

@ -12,6 +12,7 @@ class Laplace(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Laplace(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # Laplace distributed with loc=0, scale=1
tensor([ 0.1046])

@ -34,6 +34,7 @@ class LKJCholesky(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> l = LKJCholesky(3, 0.5)
>>> l.sample() # l @ l.T is a sample of a correlation 3x3 matrix
tensor([[ 1.0000, 0.0000, 0.0000],

@ -15,6 +15,7 @@ class LogNormal(TransformedDistribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = LogNormal(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # log-normal distributed with mean=0 and stddev=1
tensor([ 0.1046])

@ -22,7 +22,8 @@ class LogisticNormal(TransformedDistribution):

>>> # logistic-normal distributed with mean=(0, 0, 0) and stddev=(1, 1, 1)
>>> # of the base Normal distribution
>>> m = distributions.LogisticNormal(torch.tensor([0.0] * 3), torch.tensor([1.0] * 3))
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = LogisticNormal(torch.tensor([0.0] * 3), torch.tensor([1.0] * 3))
>>> m.sample()
tensor([ 0.7653, 0.0341, 0.0579, 0.1427])

@ -52,7 +52,7 @@ class LowRankMultivariateNormal(Distribution):
covariance_matrix = cov_factor @ cov_factor.T + cov_diag

Example:

>>> # xdoctest: +REQUIRES(--lapack)
>>> m = LowRankMultivariateNormal(torch.zeros(2), torch.tensor([[1.], [0.]]), torch.ones(2))
>>> m.sample() # normally distributed with mean=`[0,0]`, cov_factor=`[[1],[0]]`, cov_diag=`[1,1]`
tensor([-0.2102, -0.5429])

@ -17,24 +17,25 @@ class MixtureSameFamily(Distribution):

Examples::

# Construct Gaussian Mixture Model in 1D consisting of 5 equally
# weighted normal distributions
>>> # xdoctest: +SKIP("undefined vars")
>>> # Construct Gaussian Mixture Model in 1D consisting of 5 equally
>>> # weighted normal distributions
>>> mix = D.Categorical(torch.ones(5,))
>>> comp = D.Normal(torch.randn(5,), torch.rand(5,))
>>> gmm = MixtureSameFamily(mix, comp)

# Construct Gaussian Mixture Modle in 2D consisting of 5 equally
# weighted bivariate normal distributions
>>> # Construct Gaussian Mixture Modle in 2D consisting of 5 equally
>>> # weighted bivariate normal distributions
>>> mix = D.Categorical(torch.ones(5,))
>>> comp = D.Independent(D.Normal(
torch.randn(5,2), torch.rand(5,2)), 1)
... torch.randn(5,2), torch.rand(5,2)), 1)
>>> gmm = MixtureSameFamily(mix, comp)

# Construct a batch of 3 Gaussian Mixture Models in 2D each
# consisting of 5 random weighted bivariate normal distributions
>>> # Construct a batch of 3 Gaussian Mixture Models in 2D each
>>> # consisting of 5 random weighted bivariate normal distributions
>>> mix = D.Categorical(torch.rand(3,5))
>>> comp = D.Independent(D.Normal(
torch.randn(3,5,2), torch.rand(3,5,2)), 1)
... torch.randn(3,5,2), torch.rand(3,5,2)), 1)
>>> gmm = MixtureSameFamily(mix, comp)

Args:
@ -32,6 +32,7 @@ class Multinomial(Distribution):

Example::

>>> # xdoctest: +SKIP("FIXME: found invalid values")
>>> m = Multinomial(100, torch.tensor([ 1., 1., 1., 1.]))
>>> x = m.sample() # equal probability of 0, 1, 2, 3
tensor([ 21., 24., 30., 25.])

@ -91,6 +91,7 @@ class MultivariateNormal(Distribution):

Example:

>>> # xdoctest: +REQUIRES(--lapack)
>>> m = MultivariateNormal(torch.zeros(2), torch.eye(2))
>>> m.sample() # normally distributed with mean=`[0,0]` and covariance_matrix=`I`
tensor([-0.2102, -0.5429])

@ -16,6 +16,7 @@ class Normal(ExponentialFamily):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # normally distributed with loc=0 and scale=1
tensor([ 0.1046])

@ -25,6 +25,7 @@ class OneHotCategorical(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m.sample() # equal probability of 0, 1, 2, 3
tensor([ 0., 0., 0., 1.])

@ -12,6 +12,7 @@ class Pareto(TransformedDistribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Pareto(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Pareto distribution with scale=1 and alpha=1
tensor([ 1.5623])

@ -18,6 +18,7 @@ class Poisson(ExponentialFamily):

Example::

>>> # xdoctest: +SKIP("poisson_cpu not implemented for 'Long'")
>>> m = Poisson(torch.tensor([4]))
>>> m.sample()
tensor([ 3.])

@ -100,8 +100,9 @@ class RelaxedBernoulli(TransformedDistribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = RelaxedBernoulli(torch.tensor([2.2]),
torch.tensor([0.1, 0.2, 0.3, 0.99]))
... torch.tensor([0.1, 0.2, 0.3, 0.99]))
>>> m.sample()
tensor([ 0.2951, 0.3442, 0.8918, 0.9021])

@ -94,8 +94,9 @@ class RelaxedOneHotCategorical(TransformedDistribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = RelaxedOneHotCategorical(torch.tensor([2.2]),
torch.tensor([0.1, 0.2, 0.3, 0.4]))
... torch.tensor([0.1, 0.2, 0.3, 0.4]))
>>> m.sample()
tensor([ 0.1294, 0.2324, 0.3859, 0.2523])

@ -15,6 +15,7 @@ class StudentT(Distribution):

Example::

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = StudentT(torch.tensor([2.0]))
>>> m.sample() # Student's t-distributed with degrees of freedom=2
tensor([ 0.1046])

@ -17,6 +17,7 @@ class Uniform(Distribution):

>>> m = Uniform(torch.tensor([0.0]), torch.tensor([5.0]))
>>> m.sample() # uniformly distributed in the range [0.0, 5.0)
>>> # xdoctest: +SKIP
tensor([ 2.3418])

Args:

@ -75,7 +75,8 @@ class VonMises(Distribution):
interpreted as angles modulo 2 pi.

Example::
>>> m = dist.VonMises(torch.tensor([1.0]), torch.tensor([1.0]))
>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = VonMises(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # von Mises distributed with loc=1 and concentration=1
tensor([1.9777])

@ -14,6 +14,7 @@ class Weibull(TransformedDistribution):

Example:

>>> # xdoctest: +IGNORE_WANT("non-deterinistic")
>>> m = Weibull(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Weibull distribution with scale=1, concentration=1
tensor([ 0.4784])

@ -33,9 +33,10 @@ class Wishart(ExponentialFamily):
or its Cholesky decomposition :math:`\mathbf{\Sigma} = \mathbf{L}\mathbf{L}^\top`

Example:
>>> # xdoctest: +SKIP("FIXME: scale_tril must be at least two-dimensional")
>>> m = Wishart(torch.eye(2), torch.Tensor([2]))
>>> m.sample() # Wishart distributed with mean=`df * I` and
# variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j
>>> # variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j

Args:
covariance_matrix (Tensor): positive-definite covariance matrix
|
||||
|
||||
Examples::
|
||||
|
||||
# trace
|
||||
>>> # trace
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> torch.einsum('ii', torch.randn(4, 4))
|
||||
tensor(-1.2104)
|
||||
|
||||
# diagonal
|
||||
>>> # diagonal
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> torch.einsum('ii->i', torch.randn(4, 4))
|
||||
tensor([-0.1034, 0.7952, -0.2433, 0.4545])
|
||||
|
||||
# outer product
|
||||
>>> # outer product
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> x = torch.randn(5)
|
||||
>>> y = torch.randn(4)
|
||||
>>> torch.einsum('i,j->ij', x, y)
|
||||
@ -275,7 +278,8 @@ def einsum(*args: Any) -> Tensor:
|
||||
[ 0.1713, -0.4291, -0.5802, 0.7350],
|
||||
[ 0.5704, -1.4290, -1.9323, 2.4480]])
|
||||
|
||||
# batch matrix multiplication
|
||||
>>> # batch matrix multiplication
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> As = torch.randn(3,2,5)
|
||||
>>> Bs = torch.randn(3,5,4)
|
||||
>>> torch.einsum('bij,bjk->bik', As, Bs)
|
||||
@ -288,7 +292,8 @@ def einsum(*args: Any) -> Tensor:
|
||||
[[ 2.8153, 1.8787, -4.3839, -1.2112],
|
||||
[ 0.3728, -2.1131, 0.0921, 0.8305]]])
|
||||
|
||||
# with sublist format and ellipsis
|
||||
>>> # with sublist format and ellipsis
|
||||
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
|
||||
>>> torch.einsum(As, [..., 0, 1], Bs, [..., 1, 2], [..., 0, 2])
|
||||
tensor([[[-1.0564, -1.5904, 3.2023, 3.1271],
|
||||
[-1.6706, -0.8097, -0.8025, -2.1183]],
|
||||
@ -299,12 +304,12 @@ def einsum(*args: Any) -> Tensor:
|
||||
[[ 2.8153, 1.8787, -4.3839, -1.2112],
|
||||
[ 0.3728, -2.1131, 0.0921, 0.8305]]])
|
||||
|
||||
# batch permute
|
||||
>>> # batch permute
|
||||
>>> A = torch.randn(2, 3, 4, 5)
|
||||
>>> torch.einsum('...ij->...ji', A).shape
|
||||
torch.Size([2, 3, 5, 4])
|
||||
|
||||
# equivalent to torch.nn.functional.bilinear
|
||||
>>> # equivalent to torch.nn.functional.bilinear
|
||||
>>> A = torch.randn(3,5,4)
|
||||
>>> l = torch.randn(2,5)
|
||||
>>> r = torch.randn(2,4)
|
||||
@ -453,6 +458,7 @@ else:
|
||||
>>> z = torch.sin(torch.sqrt(x * x + y * y))
|
||||
>>> ax = plt.axes(projection='3d')
|
||||
>>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy())
|
||||
>>> # xdoctest: +SKIP
|
||||
<mpl_toolkits.mplot3d.art3d.Poly3DCollection object at 0x7f8f30d40100>
|
||||
>>> plt.show()
|
||||
|
||||
@ -730,6 +736,7 @@ def _unique_impl(input: Tensor, sorted: bool = True,
|
||||
|
||||
>>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long))
|
||||
>>> output
|
||||
>>> # xdoctest: +SKIP
|
||||
tensor([ 2, 3, 1])
|
||||
|
||||
>>> output, inverse_indices = torch.unique(
|
||||
@ -1014,6 +1021,7 @@ def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811
|
||||
|
||||
>>> a = torch.randn(3, 4, 5, device='cuda')
|
||||
>>> b = torch.randn(4, 5, 6, device='cuda')
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> c = torch.tensordot(a, b, dims=2).cpu()
|
||||
tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741],
|
||||
[ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744],
|
||||
@ -1081,6 +1089,7 @@ def cartesian_prod(*tensors):
|
||||
|
||||
>>> a = [1, 2, 3]
|
||||
>>> b = [4, 5]
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> list(itertools.product(a, b))
|
||||
[(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]
|
||||
>>> tensor_a = torch.tensor(a)
|
||||
@ -1203,6 +1212,7 @@ def atleast_1d(*tensors):
|
||||
|
||||
>>> x = torch.randn(2)
|
||||
>>> x
|
||||
>>> # xdoctest: +SKIP
|
||||
tensor([1.4584, 0.7583])
|
||||
>>> torch.atleast_1d(x)
|
||||
tensor([1.4584, 0.7583])
|
||||
@ -1243,6 +1253,7 @@ def atleast_2d(*tensors):
|
||||
tensor([[1.]])
|
||||
>>> x = torch.randn(2,2)
|
||||
>>> x
|
||||
>>> # xdoctest: +SKIP
|
||||
tensor([[2.2086, 2.5165],
|
||||
[0.1757, 0.5194]])
|
||||
>>> torch.atleast_2d(x)
|
||||
@ -1280,6 +1291,7 @@ def atleast_3d(*tensors):
|
||||
tensor([[[0.5000]]])
|
||||
>>> y = torch.randn(2,2)
|
||||
>>> y
|
||||
>>> # xdoctest: +SKIP
|
||||
tensor([[-0.8079, 0.7460],
|
||||
[-1.1647, 1.4734]])
|
||||
>>> torch.atleast_3d(y)
|
||||
@ -1414,6 +1426,7 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa
|
||||
>>> a = torch.arange(9, dtype= torch.float) - 4
|
||||
>>> b = a.reshape((3, 3))
|
||||
>>> torch.norm(a)
|
||||
>>> # xdoctest: +SKIP
|
||||
tensor(7.7460)
|
||||
>>> torch.norm(b)
|
||||
tensor(7.7460)
|
||||
@ -1529,6 +1542,7 @@ def chain_matmul(*matrices, out=None):
|
||||
>>> c = torch.randn(5, 6)
|
||||
>>> d = torch.randn(6, 7)
|
||||
>>> torch.chain_matmul(a, b, c, d)
|
||||
>>> # xdoctest: +SKIP
|
||||
tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614],
|
||||
[ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163],
|
||||
[ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]])
|
||||
@ -1621,6 +1635,7 @@ def _lu_impl(A, pivot=True, get_infos=False, out=None):
|
||||
|
||||
Example::
|
||||
|
||||
>>> # xdoctest: +REQUIRES(--lapack)
|
||||
>>> A = torch.randn(2, 3, 3)
|
||||
>>> A_LU, pivots = torch.lu(A)
|
||||
>>> A_LU
|
||||
|
@ -37,6 +37,7 @@ def _reify(o, s):
|
||||
|
||||
def reify(e, s):
|
||||
""" Replace variables of expression with substitution
|
||||
>>> # xdoctest: +SKIP
|
||||
>>> x, y = var(), var()
|
||||
>>> e = (1, x, (3, y))
|
||||
>>> s = {x: 2, y: 4}
|
||||
|
Some files were not shown because too many files have changed in this diff.