diff --git a/.circleci/docker/requirements-ci.txt b/.circleci/docker/requirements-ci.txt
index 8b18a1745808..5662eadc4f66 100644
--- a/.circleci/docker/requirements-ci.txt
+++ b/.circleci/docker/requirements-ci.txt
@@ -164,11 +164,16 @@ pytest-rerunfailures
 #Pinned versions:
 #test that import:
 
-#xdoctest
+xdoctest==1.0.2
 #Description: runs doctests in pytest
-#Pinned versions:
+#Pinned versions: 1.0.2
 #test that import:
 
+pygments==2.12.0
+#Description: support doctest highlighting
+#Pinned versions: 2.12.0
+#test that import: the doctests
+
 #PyYAML
 #Description: data serialization format
 #Pinned versions:
diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh
index 68f7f2619209..323ce3965a86 100755
--- a/.jenkins/pytorch/macos-test.sh
+++ b/.jenkins/pytorch/macos-test.sh
@@ -16,9 +16,9 @@ fi
 pip install "unittest-xml-reporting<=3.2.0,>=2.0.0" \
   pytest \
   pytest-xdist \
-  pytest-rerunfailures
-  # TODO: enable xdoctest later
-  # xdoctest
+  pytest-rerunfailures \
+  "xdoctest==1.0.2" \
+  "pygments==2.12.0"
 
 if [ -z "${CI}" ]; then
   rm -rf "${WORKSPACE_DIR}"/miniconda3/lib/python3.6/site-packages/torch*
diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
index c598a04e0f97..79e8aedfab75 100644
--- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -36,8 +36,7 @@ popd
 =======
 :: Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014
-pip install "ninja==1.10.0.post1" future "hypothesis==5.35.1" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest pytest-xdist pytest-rerunfailures
-:: # TODO: enable xdoctest later
+pip install "ninja==1.10.0.post1" future "hypothesis==5.35.1" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest pytest-xdist pytest-rerunfailures "xdoctest==1.0.2" "pygments==2.12.0"
 if errorlevel 1 exit /b
 if not errorlevel 0 exit /b
diff --git a/pytest.ini b/pytest.ini
index 69185dd94ee9..53b5ad643ebf 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -7,11 +7,6 @@ addopts =
     # capture only Python print and C++ py::print, but not C output (low-level Python errors)
    --capture=sys
    --disable-warnings
-    # TODO: enable xdoctest later
-    #--xdoctest
-    #--xdoctest-style=google
-    #--xdoctest-global-exec="from torch import nn\nimport torch.nn.functional as F\nimport torch"
-    #--xdoctest-options=+IGNORE_WHITESPACE
 testpaths =
     test
 junit_logging_reruns = all
diff --git a/test/run_test.py b/test/run_test.py
old mode 100644
new mode 100755
index 6cd6ebde31f6..a0506b2c9978
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -128,6 +128,10 @@ TESTS = discover_tests(
     ]
 )
 
+# The doctests are a special case: they don't correspond to a file that
+# discover_tests can find.
+TESTS = TESTS + ['doctests']
+
 FSDP_TEST = [test for test in TESTS if test.startswith("distributed/fsdp")]
 
 # Tests need to be run with pytest.
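For context: the `doctests` entry registered above has no backing test file; run_test.py dispatches it through the `CUSTOM_HANDLERS` table to the `run_doctests` handler added below. A minimal standalone sketch of what that handler boils down to, assuming `xdoctest==1.0.2` as pinned above (illustrative only, not part of the patch):

```python
# Minimal sketch (not part of the patch) of what the new "doctests" test
# module does: run the xdoctest runner over the installed torch package,
# assuming xdoctest==1.0.2 as pinned in requirements-ci.txt.
import os
import pathlib

import torch
import xdoctest

pkgpath = pathlib.Path(torch.__file__).parent

run_summary = xdoctest.runner.doctest_module(
    os.fspath(pkgpath),
    config={
        # Many doctests assume these names exist in the global namespace.
        'global_exec': r'\n'.join([
            'from torch import nn',
            'import torch.nn.functional as F',
            'import torch',
        ]),
        'style': 'google',
        'options': '+IGNORE_WHITESPACE',
    },
    command='list',  # 'list' only parses and checks syntax; 'all' executes
    argv=[],
    verbose=1,
)
print('n_failed:', run_summary.get('n_failed', 0))
```

With the patch applied, the equivalent CI entry point would be `python test/run_test.py --include doctests --xdoctest-command=all` (assuming run_test.py's existing `--include` selector).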
@@ -348,20 +352,6 @@ def get_executable_command(options, allow_pytest, disable_coverage=False):
     if options.pytest:
         if allow_pytest:
             executable += ["-m", "pytest"]
-            # Enable xdoctest
-            # TODO: enable xdoctest later
-            # Many doctests assume the existence of these variables
-            # xdoctest_global_exec_lines = r'\n'.join([
-            #     'from torch import nn',
-            #     'import torch.nn.functional as F',
-            #     'import torch',
-            # ])
-            # executable += [
-            #     "--xdoctest",
-            #     "--xdoctest-style=google",
-            #     f"--xdoctest-global-exec='{xdoctest_global_exec_lines}'",
-            #     "--xdoctest-options=+IGNORE_WHITESPACE"
-            # ]
         else:
             print_to_stderr(
                 "Pytest cannot be used for this test. Falling back to unittest."
@@ -565,6 +555,79 @@ def test_distributed(test_module, test_directory, options):
     return 0
 
 
+def run_doctests(test_module, test_directory, options):
+    """
+    Assumes the incoming test module is called doctests, and simply executes the
+    xdoctest runner on the torch library itself.
+    """
+    import xdoctest
+    import pathlib
+    pkgpath = pathlib.Path(torch.__file__).parent
+
+    enabled = {
+        # TODO: expose these options to the user
+        # Temporarily disable all feature-conditional tests
+        # 'lapack': 'auto',
+        # 'cuda': 'auto',
+        # 'cuda1': 'auto',
+        # 'qengine': 'auto',
+        'lapack': 0,
+        'cuda': 0,
+        'cuda1': 0,
+        'qengine': 0,
+    }
+
+    # Resolve "auto" based on a test to determine if the feature is available.
+    if enabled['cuda'] == 'auto' and torch.cuda.is_available():
+        enabled['cuda'] = True
+
+    if enabled['cuda1'] == 'auto' and torch.cuda.is_available() and torch.cuda.device_count() > 1:
+        enabled['cuda1'] = True
+
+    if enabled['lapack'] == 'auto' and torch._C.has_lapack:
+        enabled['lapack'] = True
+
+    if enabled['qengine'] == 'auto':
+        try:
+            # Is there a better check if quantization is enabled?
+            import torch.nn.quantized as nnq  # NOQA
+            torch.backends.quantized.engine = 'qnnpack'
+            torch.backends.quantized.engine = 'fbgemm'
+        except (ImportError, RuntimeError):
+            ...
+        else:
+            enabled['qengine'] = True
+
+    # Set doctest environment variables
+    if enabled['cuda']:
+        os.environ['TORCH_DOCTEST_CUDA'] = '1'
+
+    if enabled['cuda1']:
+        os.environ['TORCH_DOCTEST_CUDA1'] = '1'
+
+    if enabled['lapack']:
+        os.environ['TORCH_DOCTEST_LAPACK'] = '1'
+
+    if enabled['qengine']:
+        os.environ['TORCH_DOCTEST_QENGINE'] = '1'
+
+    xdoctest_config = {
+        'global_exec': r'\n'.join([
+            'from torch import nn',
+            'import torch.nn.functional as F',
+            'import torch',
+        ]),
+        'style': 'google',
+        'options': '+IGNORE_WHITESPACE',
+    }
+    xdoctest_verbose = max(1, options.verbose)
+    run_summary = xdoctest.runner.doctest_module(
+        os.fspath(pkgpath), config=xdoctest_config, verbose=xdoctest_verbose,
+        command=options.xdoctest_command, argv=[])
+    result = 1 if run_summary.get('n_failed', 0) else 0
+    return result
+
+
 CUSTOM_HANDLERS = {
     "test_cuda_primary_ctx": test_cuda_primary_ctx,
     "test_cuda_trace": get_run_test_with_subprocess_fn(),
@@ -583,6 +646,7 @@ CUSTOM_HANDLERS = {
     "distributed/rpc/test_tensorpipe_agent": get_run_test_with_subprocess_fn(),
     "distributed/rpc/test_share_memory": get_run_test_with_subprocess_fn(),
     "distributed/rpc/cuda/test_tensorpipe_agent": get_run_test_with_subprocess_fn(),
+    "doctests": run_doctests,
 }
 
 
@@ -739,6 +803,15 @@ def parse_args():
         action="store_true",
         help="Only list the test that will run.",
     )
+    parser.add_argument(
+        "--xdoctest-command",
+        default='list',
+        help=(
+            "Control the specific doctest action. "
+            "Use 'list' to simply parse doctests and check syntax. 
" + "Use 'all' to execute all doctests or specify a specific " + "doctest to run") + ) return parser.parse_args() diff --git a/torch/_prims/context.py b/torch/_prims/context.py index 75143bceaff3..a404d44b6f2d 100644 --- a/torch/_prims/context.py +++ b/torch/_prims/context.py @@ -77,7 +77,8 @@ class NvfuserPrimsMode(torch.overrides.TorchFunctionMode): Switches the interpretation of torch.ops.prims.* functions to use nvFuser's prims in torch.ops.nvprims.* - >>> with NvfuserPrimMode(): + >>> # xdoctest: +SKIP("undefined vars") + >>> with NvfuserPrimsMode(): ... torch.ops.prims.add(x, y) # calls torch.ops.nvprims.add(x, y) By default, this context manager will fall back on the torch.ops.prims* if the diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index 493f17637a1b..8ec35ab8f0d3 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -46,12 +46,20 @@ def set_printoptions( Example:: + >>> # Limit the precision of elements >>> torch.set_printoptions(precision=2) >>> torch.tensor([1.12345]) tensor([1.12]) + >>> # Limit the number of elements shown >>> torch.set_printoptions(threshold=5) >>> torch.arange(10) tensor([0, 1, 2, ..., 7, 8, 9]) + >>> # Restore defaults + >>> torch.set_printoptions(profile='default') + >>> torch.tensor([1.12345]) + tensor([1.1235]) + >>> torch.arange(10) + tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) """ if profile is not None: @@ -206,7 +214,7 @@ def _vector_str(self, indent, summarize, formatter1, formatter2=None): elements_per_line = max( 1, int(math.floor((PRINT_OPTS.linewidth - indent) / (element_length))) ) - char_per_line = element_length * elements_per_line + # char_per_line = element_length * elements_per_line # unused def _val_formatter(val, formatter1=formatter1, formatter2=formatter2): if formatter2 is not None: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index b699ea67738e..97a6fd29c6bc 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -13523,6 +13523,7 @@ Returns: Example:: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) >>> g_cpu = torch.Generator() >>> g_cuda = torch.Generator(device='cuda') """, diff --git a/torch/autograd/graph.py b/torch/autograd/graph.py index 022515bf1e97..05c0d51a61cc 100644 --- a/torch/autograd/graph.py +++ b/torch/autograd/graph.py @@ -1,6 +1,7 @@ import torch from typing import Callable, Any + class saved_tensors_hooks(): """Context-manager that sets a pair of pack / unpack hooks for saved tensors. @@ -93,7 +94,7 @@ class save_on_cpu(saved_tensors_hooks): Example:: - >>> # xdoctest: +REQUIRES(env:CUDAHOME) + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) >>> a = torch.randn(5, requires_grad=True, device="cuda") >>> b = torch.randn(5, requires_grad=True, device="cuda") >>> c = torch.randn(5, requires_grad=True, device="cuda") diff --git a/torch/backends/xeon/run_cpu.py b/torch/backends/xeon/run_cpu.py index c056af964478..69632cb20862 100644 --- a/torch/backends/xeon/run_cpu.py +++ b/torch/backends/xeon/run_cpu.py @@ -60,13 +60,13 @@ Single instance inference :: - >>> python -m torch.backends.xeon.run_cpu --throughput_mode script.py args + python -m torch.backends.xeon.run_cpu --throughput_mode script.py args 2. Run single-instance inference on a single CPU node. 
 ::
 
-    >>> python -m torch.backends.xeon.run_cpu --node_id 1 script.py args
+    python -m torch.backends.xeon.run_cpu --node_id 1 script.py args
 
 Multi-instance inference
 ------------------------
@@ -77,13 +77,13 @@ Multi-instance inference
 
 ::
 
-   >>> python -m torch.backends.xeon.run_cpu -- python_script args
+   python -m torch.backends.xeon.run_cpu -- python_script args
 
 eg: on an Intel(R) Xeon(R) Scalable Processor with 14 instance, 4 cores per instance
 
 ::
 
-   >>> python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores_per_instance 4 python_script args
+   python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores_per_instance 4 python_script args
 
 2. Run single-instance inference among multiple instances.
    By default, runs all ninstances. If you want to independently run a single instance among ninstances, specify rank.
@@ -92,27 +92,27 @@ Multi-instance inference
 
 ::
 
-   >>> python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 0 python_script args
+   python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 0 python_script args
 
 eg: run 1st instance on an Intel(R) Xeon(R) Scalable Processor with 2 instance (i.e., numactl -C 28-55)
 
 ::
 
-   >>> python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 1 python_script args
+   python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 1 python_script args
 
 eg: run 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instance, 2 cores per instance,
 first four cores (i.e., numactl -C 0-1)
 
 ::
 
-   >>> python -m torch.backends.xeon.run_cpu --core_list "0, 1, 2, 3" --ninstances 2 --ncores_per_instance 2
+   python -m torch.backends.xeon.run_cpu --core_list "0, 1, 2, 3" --ninstances 2 --ncores_per_instance 2
    --rank 0 python_script args
 
 3. To look up what optional arguments this module offers:
 
 ::
 
-   >>> python -m torch.backends.xeon.run_cpu --help
+   python -m torch.backends.xeon.run_cpu --help
 
 Memory allocator
 ----------------
diff --git a/torch/csrc/Exceptions.cpp b/torch/csrc/Exceptions.cpp
index 6342826f5daf..5210d6f713db 100644
--- a/torch/csrc/Exceptions.cpp
+++ b/torch/csrc/Exceptions.cpp
@@ -35,7 +35,7 @@ For example, you can the torch.linalg.inv function will raise torch.linalg.LinAlgError when\n \
 a matrix is not invertible.\n \
 \n\
 Example:\n \
->>> # xdoctest: +REQUIRES(--lapac)\n \
+>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)\n \
 >>> matrix = torch.eye(3, 3)\n \
 >>> matrix[-1, -1] = 0\n \
 >>> matrix\n \
diff --git a/torch/distributed/_shard/checkpoint/planner.py b/torch/distributed/_shard/checkpoint/planner.py
index d37de1e76383..f3692cc11395 100644
--- a/torch/distributed/_shard/checkpoint/planner.py
+++ b/torch/distributed/_shard/checkpoint/planner.py
@@ -102,13 +102,15 @@ class SavePlanner(abc.ABC):
     Rewriting state_dict. This is the simplest way to extend the save process as it
     doesn't requite understanding the intrincacies of how SavePlan works:
 
-    >>> class RenamePlanner(DefaultSavePlanner)
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> class RenamePlanner(DefaultSavePlanner):
     >>>     def init(self, state_dict, is_coordinator):
     >>>         # prefix all keys with `foo_``
-    >>>         super().init(self, {"foo_" + k, v for k, v in state_dict.items()}, is_coordinator)
+    >>>         super().init(self, {"foo_" + k: v for k, v in state_dict.items()}, is_coordinator)
 
     Modifying local plan and lookup in tandem.
This is useful when fine control of how data is persisted + >>> # xdoctest: +SKIP("undefined vars") >>> class FP16Planner(DefaultSavePlanner): >>> def create_local_plan(self): >>> plan = super().create_local_plan() @@ -122,6 +124,7 @@ class SavePlanner(abc.ABC): Using the global planning step to make central decisions that can't be made individually by each rank + >>> # xdoctest: +SKIP("undefined vars") >>> from itertools import islice >>> from dataclasses import replace >>> class DDPLoadBalancingPlanner(DefaultSavePlanner): @@ -141,6 +144,7 @@ class SavePlanner(abc.ABC): accomplished by having each rank contribute their data items in the local plan and the global planner aggregate them: + >>> # xdoctest: +SKIP("undefined vars") >>> class SaveExtraDataPlanner(DefaultSavePlanner): >>> def create_local_plan(self) -> SavePlan: >>> plan = super().create_local_plan() @@ -247,18 +251,20 @@ class LoadPlanner: to keep a reference to the original state_dict as load happens in place so we need to be able to perform it in place - >>> class RenamePlanner(DefaultLoadPlanner) + >>> # xdoctest: +SKIP("undefined vars") + >>> class RenamePlanner(DefaultLoadPlanner): >>> def init(self, state_dict, metadata, is_coordinator): >>> self.original_state_dict = state_dict - >>> super().init(self, {"foo_" + k, v for k, v in state_dict.items()}, is_coordinator) + >>> super().init(self, {"foo_" + k: v for k, v in state_dict.items()}, is_coordinator) >>> - >>> def load_bytes(self, read_item, value); + >>> def load_bytes(self, read_item, value): >>> # Remove the "foo_" prefix >>> self.original_state_dict[read_item.dest_index.fqn[4:]] = torch.load(value) Modifying resolve_tensor and commit_tensor to handle load time transformation. + >>> # xdoctest: +SKIP("undefined vars") >>> class MetaModelMaterialize(DefaultSavePlanner): >>> def resolve_tensor(self, read_item): >>> tensor = super().resolve_tensor(read_item) diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py index 9a8d01c40d70..72a213b1d0f7 100644 --- a/torch/distributed/nn/api/remote_module.py +++ b/torch/distributed/nn/api/remote_module.py @@ -72,6 +72,7 @@ _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING = ( "forward", ) + # RPC handler. 
def _instantiate_template(module_interface_cls, enable_moving_cpu_tensors_to_cuda): instantiator.instantiate_scriptable_remote_module_template( @@ -193,13 +194,13 @@ class _RemoteModule(nn.Module): Example:: Run the following code in two different processes: + >>> # xdoctest: +SKIP("distributed") >>> # On worker 0: >>> import torch >>> import torch.distributed.rpc as rpc >>> from torch import nn, Tensor >>> from torch.distributed.nn.api.remote_module import RemoteModule >>> - >>> # xdoctest: +SKIP >>> rpc.init_rpc("worker0", rank=0, world_size=2) >>> remote_linear_module = RemoteModule( >>> "worker1/cpu", nn.Linear, args=(20, 30), @@ -500,13 +501,13 @@ class _RemoteModule(nn.Module): Example:: Run the following code in two different processes: + >>> # xdoctest: +SKIP("distributed") >>> # On worker 0: >>> import torch >>> import torch.distributed.rpc as rpc >>> from torch import nn, Tensor >>> from torch.distributed.nn.api.remote_module import RemoteModule >>> - >>> # xdoctest: +SKIP >>> rpc.init_rpc("worker0", rank=0, world_size=2) >>> remote_module = RemoteModule( >>> "worker1/cpu", nn.Linear, args=(20, 30), @@ -622,13 +623,13 @@ class RemoteModule(_RemoteModule): Example:: Run the following code in two different processes: + >>> # xdoctest: +SKIP("distributed") >>> # On worker 0: >>> import torch >>> import torch.distributed.rpc as rpc >>> from torch import nn, Tensor >>> from torch.distributed.nn.api.remote_module import RemoteModule >>> - >>> # xdoctest: +SKIP >>> rpc.init_rpc("worker0", rank=0, world_size=2) >>> remote_linear_module = RemoteModule( >>> "worker1/cpu", nn.Linear, args=(20, 30), diff --git a/torch/distributed/optim/optimizer.py b/torch/distributed/optim/optimizer.py index caf5ab293bc4..535104beb9f4 100644 --- a/torch/distributed/optim/optimizer.py +++ b/torch/distributed/optim/optimizer.py @@ -18,6 +18,7 @@ __all__ = ['DistributedOptimizer'] logger = logging.getLogger(__name__) + # XXX: we define a _ScriptModuleOptimizer here to explicitly # compile the FunctionalOptimizer class into TorchScript # This is because ScriptClass instance still lives in @@ -33,6 +34,7 @@ class _ScriptLocalOptimizerInterface(object): def step(self, autograd_ctx_id: int) -> None: pass + class _ScriptLocalOptimizer(nn.Module): # TorchScript does not support multithread concurrent compiling. # request_callback might invoke concurrent compiling, so we @@ -106,6 +108,7 @@ def _new_script_local_optimizer(optim_cls, local_params_rref, *args, **kwargs): return rpc.RRef( script_optim, _ScriptLocalOptimizerInterface) + @jit.script def _script_local_optimizer_step( local_optim_rref: RRef[_ScriptLocalOptimizerInterface], @@ -114,6 +117,7 @@ def _script_local_optimizer_step( local_optim = local_optim_rref.local_value() local_optim.step(autograd_ctx_id) + def _wait_for_all(rpc_futs): # TODO: improve error propagation exception = None @@ -163,12 +167,12 @@ class DistributedOptimizer: kwargs: arguments to pass to the optimizer constructor on each worker. Example:: + >>> # xdoctest: +SKIP("distributed") >>> import torch.distributed.autograd as dist_autograd >>> import torch.distributed.rpc as rpc >>> from torch import optim >>> from torch.distributed.optim import DistributedOptimizer >>> - >>> # xdoctest: +SKIP >>> with dist_autograd.context() as context_id: >>> # Forward pass. 
>>> rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3)) diff --git a/torch/distributed/pipeline/sync/pipe.py b/torch/distributed/pipeline/sync/pipe.py index 81d1a7bc7793..96bc51989f62 100644 --- a/torch/distributed/pipeline/sync/pipe.py +++ b/torch/distributed/pipeline/sync/pipe.py @@ -149,14 +149,16 @@ class WithDevice(nn.Module): device(:class:`torch.device`): The device to run the module on. Example:: + >>> # xdoctest: +SKIP("distributed") >>> fc1 = nn.Linear(16, 8).cuda(0) >>> fc2 = nn.Linear(8, 4).cuda(1) >>> dropout = nn.Dropout() >>> + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA1) >>> # Dropout does not have any parameters/buffers, but we want to >>> # run it on cuda:1 to avoid any GPU to CPU transfers. >>> model = nn.Sequential(fc1, fc2, WithDevice(dropout, 'cuda:1')) - >>> # xdoctest: +SKIP + >>> # xdoctest: +SKIP("Needs RPC framework init") >>> model = Pipe(model, chunks=8) """ def __init__(self, module: nn.Module, device: torch.device): @@ -185,6 +187,7 @@ def _assemble_partition(modules: List[nn.Module]): modules_list.append(module) return PipeSequential(*modules_list) + def _split_module(modules: nn.Sequential) -> Tuple[List[nn.Sequential], List[torch.device]]: partitions = [] devices = [] diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py index cc2f235a441e..8416fc7c93a3 100644 --- a/torch/distributed/rpc/api.py +++ b/torch/distributed/rpc/api.py @@ -148,6 +148,7 @@ def _broadcast_to_followers(sequence_id, objects_map): _thread_local_var = threading.local() + @contextlib.contextmanager def _wait_all(): r""" @@ -157,10 +158,10 @@ def _wait_all(): Example:: + >>> # xdoctest: +SKIP("distributed") >>> # On worker 0: >>> import torch >>> import torch.distributed.rpc as rpc - >>> # xdoctest: +SKIP >>> rpc.init_rpc("worker0", rank=0, world_size=2) >>> with rpc._wait_all(): >>> fut_1 = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1)) @@ -176,6 +177,7 @@ def _wait_all(): finally: del _thread_local_var.future_list + @_require_initialized def _all_gather(obj, worker_names=None, timeout=UNSET_RPC_TIMEOUT): r""" @@ -285,6 +287,7 @@ def _barrier(worker_names): f"Failed to complete barrier, got error {ex}" ) + @_require_initialized def _wait_all_workers(timeout=DEFAULT_SHUTDOWN_TIMEOUT): r""" @@ -376,6 +379,7 @@ def shutdown(graceful=True, timeout=DEFAULT_SHUTDOWN_TIMEOUT): else: _finalize_shutdown() + def _finalize_shutdown(): try: # This raises a `TORCH_CHECK()` exception on RRef leak detected. @@ -396,6 +400,7 @@ def _finalize_shutdown(): _cleanup_python_rpc_handler() _reset_current_rpc_agent() + @_require_initialized def get_worker_info(worker_name=None): r""" @@ -453,7 +458,6 @@ def _rref_typeof_on_user(rref, timeout=UNSET_RPC_TIMEOUT, blocking=True): return fut - T = TypeVar("T") GenericWithOneTypeVar = Generic[T] @@ -669,6 +673,7 @@ def remote(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT): return rref + def _invoke_rpc(to, func, rpc_type, args=None, kwargs=None, rpc_timeout=UNSET_RPC_TIMEOUT): if not callable(func): raise TypeError("function should be callable.") @@ -900,6 +905,7 @@ def rpc_async(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT): _thread_local_var.future_list.append(fut) return fut + def _get_should_profile(): # Legacy profiler should be enabled. RPC profiling is not supported with # Kineto profiler. 
@@ -909,6 +915,7 @@ def _get_should_profile():
         torch._C._autograd._profiler_type() == ActiveProfilerType.LEGACY  # type: ignore[attr-defined]
     )
 
+
 def _enable_rpc_profiler(should_profile, qualified_name, func, rpc_type, dst_worker_info):
     ctx_manager = contextlib.suppress()
 
diff --git a/torch/distributed/rpc/options.py b/torch/distributed/rpc/options.py
index bb67ac032e6d..a995184bc823 100644
--- a/torch/distributed/rpc/options.py
+++ b/torch/distributed/rpc/options.py
@@ -113,6 +113,7 @@ class TensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
             invertible.
 
         Example::
+            >>> # xdoctest: +SKIP("distributed")
             >>> # both workers
             >>> def add(x, y):
             >>>     print(x)  # tensor([1., 1.], device='cuda:1')
@@ -127,7 +128,6 @@ class TensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
             >>> options.set_device_map("worker1", {1: 2})
             >>> # maps worker0's cuda:1 to worker1's cuda:2
             >>>
-            >>> # xdoctest: +SKIP
             >>> rpc.init_rpc(
             >>>     "worker0",
             >>>     rank=0,
diff --git a/torch/distributions/lowrank_multivariate_normal.py b/torch/distributions/lowrank_multivariate_normal.py
index 5c83dcc9e7de..921477ac99a4 100644
--- a/torch/distributions/lowrank_multivariate_normal.py
+++ b/torch/distributions/lowrank_multivariate_normal.py
@@ -8,6 +8,7 @@ from torch.distributions.utils import _standard_normal, lazy_property
 
 __all__ = ['LowRankMultivariateNormal']
 
+
 def _batch_capacitance_tril(W, D):
     r"""
     Computes Cholesky of :math:`I + W.T @ inv(D) @ W` for a batch of matrices :math:`W`
@@ -52,7 +53,8 @@ class LowRankMultivariateNormal(Distribution):
         covariance_matrix = cov_factor @ cov_factor.T + cov_diag
 
     Example:
-        >>> # xdoctest: +REQUIRES(--lapack)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
         >>> m = LowRankMultivariateNormal(torch.zeros(2), torch.tensor([[1.], [0.]]), torch.ones(2))
         >>> m.sample()  # normally distributed with mean=`[0,0]`, cov_factor=`[[1],[0]]`, cov_diag=`[1,1]`
         tensor([-0.2102, -0.5429])
diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py
index 55a5dd3a228a..e8c15c32d985 100644
--- a/torch/distributions/multivariate_normal.py
+++ b/torch/distributions/multivariate_normal.py
@@ -7,6 +7,7 @@ from torch.distributions.utils import _standard_normal, lazy_property
 
 __all__ = ['MultivariateNormal']
 
+
 def _batch_mv(bmat, bvec):
     r"""
     Performs a batched matrix-vector product, with compatible but different batch shapes.
@@ -91,7 +92,8 @@ class MultivariateNormal(Distribution):
 
     Example:
 
-        >>> # xdoctest: +REQUIRES(--lapack)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
         >>> m = MultivariateNormal(torch.zeros(2), torch.eye(2))
         >>> m.sample()  # normally distributed with mean=`[0,0]` and covariance_matrix=`I`
         tensor([-0.2102, -0.5429])
diff --git a/torch/functional.py b/torch/functional.py
index abe3fef5eade..12c6f1b143e1 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -136,7 +136,6 @@ def broadcast_shapes(*shapes):
     return tensors[0].shape
 
 
-
 def split(
     tensor: Tensor, split_size_or_sections: Union[int, List[int]], dim: int = 0
 ) -> List[Tensor]:
@@ -451,6 +450,7 @@ else:
 
         `torch.meshgrid` is commonly used to produce a grid for plotting.
+ >>> # xdoctest: +REQUIRES(module:matplotlib) >>> import matplotlib.pyplot as plt >>> xs = torch.linspace(-5, 5, steps=100) >>> ys = torch.linspace(-5, 5, steps=100) @@ -458,8 +458,6 @@ else: >>> z = torch.sin(torch.sqrt(x * x + y * y)) >>> ax = plt.axes(projection='3d') >>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy()) - >>> # xdoctest: +SKIP - >>> plt.show() .. image:: ../_static/img/meshgrid.png @@ -736,23 +734,22 @@ def _unique_impl(input: Tensor, sorted: bool = True, >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output - >>> # xdoctest: +SKIP - tensor([ 2, 3, 1]) + tensor([1, 2, 3]) >>> output, inverse_indices = torch.unique( ... torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output - tensor([ 1, 2, 3]) + tensor([1, 2, 3]) >>> inverse_indices - tensor([ 0, 2, 1, 2]) + tensor([0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( ... torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output - tensor([ 1, 2, 3]) + tensor([1, 2, 3]) >>> inverse_indices - tensor([[ 0, 2], - [ 1, 2]]) + tensor([[0, 2], + [1, 2]]) """ if has_torch_function_unary(input): @@ -983,6 +980,7 @@ else: def tensordot(a, b, dims: torch.Tensor, out: Optional[torch.Tensor] = None): # noqa: F811 pass + def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811 r"""Returns a contraction of a and b over multiple dimensions. @@ -1019,9 +1017,9 @@ def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811 [4796., 5162.], [4928., 5306.]]) + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') - >>> # xdoctest: +SKIP >>> c = torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], @@ -1073,6 +1071,7 @@ def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811 else: return _VF.tensordot(a, b, dims_a, dims_b, out=out) # type: ignore[attr-defined] + def cartesian_prod(*tensors: Tensor) -> Tensor: """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. @@ -1087,9 +1086,9 @@ def cartesian_prod(*tensors: Tensor) -> Tensor: Example:: + >>> import itertools >>> a = [1, 2, 3] >>> b = [4, 5] - >>> # xdoctest: +SKIP >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) @@ -1107,6 +1106,7 @@ def cartesian_prod(*tensors: Tensor) -> Tensor: return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) # type: ignore[attr-defined] + def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. @@ -1197,6 +1197,7 @@ def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): else: raise ValueError(f"{compute_mode} is not a valid value for compute_mode") + def atleast_1d(*tensors): r""" Returns a 1-dimensional view of each input tensor with zero dimensions. @@ -1210,12 +1211,11 @@ def atleast_1d(*tensors): Example:: - >>> x = torch.randn(2) + >>> x = torch.arange(2) >>> x - >>> # xdoctest: +SKIP - tensor([1.4584, 0.7583]) + tensor([0, 1]) >>> torch.atleast_1d(x) - tensor([1.4584, 0.7583]) + tensor([0, 1]) >>> x = torch.tensor(1.) >>> x tensor(1.) 
@@ -1233,6 +1233,7 @@ def atleast_1d(*tensors): tensors = tensors[0] return _VF.atleast_1d(tensors) # type: ignore[attr-defined] + def atleast_2d(*tensors): r""" Returns a 2-dimensional view of each input tensor with zero dimensions. @@ -1251,14 +1252,13 @@ def atleast_2d(*tensors): tensor(1.) >>> torch.atleast_2d(x) tensor([[1.]]) - >>> x = torch.randn(2,2) + >>> x = torch.arange(4).view(2,2) >>> x - >>> # xdoctest: +SKIP - tensor([[2.2086, 2.5165], - [0.1757, 0.5194]]) + tensor([[0, 1], + [2, 3]]) >>> torch.atleast_2d(x) - tensor([[2.2086, 2.5165], - [0.1757, 0.5194]]) + tensor([[0, 1], + [2, 3]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_2d((x,y)) @@ -1271,6 +1271,7 @@ def atleast_2d(*tensors): tensors = tensors[0] return _VF.atleast_2d(tensors) # type: ignore[attr-defined] + def atleast_3d(*tensors): r""" Returns a 3-dimensional view of each input tensor with zero dimensions. @@ -1289,22 +1290,21 @@ def atleast_3d(*tensors): tensor(0.5000) >>> torch.atleast_3d(x) tensor([[[0.5000]]]) - >>> y = torch.randn(2,2) + >>> y = torch.arange(4).view(2,2) >>> y - >>> # xdoctest: +SKIP - tensor([[-0.8079, 0.7460], - [-1.1647, 1.4734]]) + tensor([[0, 1], + [2, 3]]) >>> torch.atleast_3d(y) - tensor([[[-0.8079], - [ 0.7460]], + tensor([[[0], + [1]], - [[-1.1647], - [ 1.4734]]]) - >>> x = torch.randn(1,1,1) + [[2], + [3]]]) + >>> x = torch.tensor(1).view(1, 1, 1) >>> x - tensor([[[-1.5689]]]) + tensor([[[1]]]) >>> torch.atleast_3d(x) - tensor([[[-1.5689]]]) + tensor([[[1]]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_3d((x,y)) @@ -1426,7 +1426,6 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) - >>> # xdoctest: +SKIP tensor(7.7460) >>> torch.norm(b) tensor(7.7460) @@ -1514,6 +1513,7 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) # type: ignore[attr-defined] + def chain_matmul(*matrices, out=None): r"""Returns the matrix product of the :math:`N` 2-D tensors. 
    This product is efficiently computed using the matrix chain order algorithm which selects the order in which
    incurs the lowest cost in terms
@@ -1537,12 +1537,13 @@
 
     Example::
 
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
         >>> a = torch.randn(3, 4)
         >>> b = torch.randn(4, 5)
         >>> c = torch.randn(5, 6)
         >>> d = torch.randn(6, 7)
+        >>> # will raise a deprecation warning
         >>> torch.chain_matmul(a, b, c, d)
-        >>> # xdoctest: +SKIP
         tensor([[ -2.3375,  -3.9790,  -4.1119,  -6.6577,   9.5609, -11.5095,  -3.2614],
                 [ 21.4038,   3.3378,  -8.4982,  -5.2457, -10.2561,  -2.4684,   2.7163],
                 [ -0.9647,  -5.8917,  -2.3213,  -5.2284,  12.8615, -12.2816,  -2.5095]])
@@ -1635,7 +1636,8 @@ def _lu_impl(A, pivot=True, get_infos=False, out=None):
 
     Example::
 
-        >>> # xdoctest: +REQUIRES(--lapack)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
         >>> A = torch.randn(2, 3, 3)
        >>> A_LU, pivots = torch.lu(A)
         >>> A_LU
@@ -1662,6 +1664,7 @@ if TYPE_CHECKING:
 else:
     _ListOrSeq = List[Tensor]
 
+
 def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None:
     get_infos_int = 1 if get_infos else 0
     if out_len - get_infos_int != 2:
@@ -1669,6 +1672,7 @@ def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None:
     if not isinstance(out, (tuple, list)):
         raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}")
 
+
 def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
     # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor]
     if has_torch_function_unary(A):
@@ -1683,6 +1687,7 @@ def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
     else:
         return result  # A_LU, pivots, infos
 
+
 def _lu_no_infos(A, pivot=True, get_infos=False, out=None):
     # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor]
     # need to check for torch_function here so that we exit if
@@ -1710,5 +1715,6 @@ lu = boolean_dispatch(
     func_name='lu')
 lu.__doc__ = _lu_impl.__doc__
 
+
 def align_tensors(*tensors):
     raise RuntimeError('`align_tensors` not yet implemented.')
diff --git a/torch/futures/__init__.py b/torch/futures/__init__.py
index 1795983b3f30..f2ba35f1e80b 100644
--- a/torch/futures/__init__.py
+++ b/torch/futures/__init__.py
@@ -9,9 +9,11 @@ __all__ = ['Future', 'collect_all', 'wait_all']
 T = TypeVar("T")
 S = TypeVar("S")
 
+
 class _PyFutureMeta(type(torch._C.Future), type(Generic)):  # type: ignore[misc, no-redef]
     pass
 
+
 class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta):
     r"""
     Wrapper around a ``torch._C.Future`` which encapsulates an asynchronous
diff --git a/torch/hub.py b/torch/hub.py
index cc27b15930bb..66532d928dd1 100644
--- a/torch/hub.py
+++ b/torch/hub.py
@@ -170,7 +170,6 @@ def _validate_not_a_forked_repo(repo_owner, repo_name, ref):
                 'If it\'s a commit from a forked repo, please call hub.load() with forked repo directly.')
 
 
-
 def _get_cache_or_reload(github, force_reload, trust_repo, calling_fn, verbose=True, skip_validation=False):
     # Setup hub_dir to save downloaded files
     hub_dir = get_dir()
@@ -240,6 +239,7 @@ def _get_cache_or_reload(github, force_reload, trust_repo, calling_fn, verbose=True, skip_validation=False):
 
     return repo_dir
 
+
 def _check_repo_is_trusted(repo_owner, repo_name, owner_name_branch, trust_repo, calling_fn="load"):
     hub_dir = get_dir()
     filepath = os.path.join(hub_dir, "trusted_list")
@@ -522,11 +522,11 @@ def load(repo_or_dir, model, *args, source='github', trust_repo=None, force_reload=False,
 
     Example:
         >>> # from a github repo
        >>> repo = 
'pytorch/vision' - >>> model = torch.hub.load(repo, 'resnet50', pretrained=True) + >>> model = torch.hub.load(repo, 'resnet50', weights='ResNet50_Weights.IMAGENET1K_V1') >>> # from a local directory >>> path = '/some/local/path/pytorch/vision' >>> # xdoctest: +SKIP - >>> model = torch.hub.load(path, 'resnet50', pretrained=True) + >>> model = torch.hub.load(path, 'resnet50', weights='ResNet50_Weights.DEFAULT') """ source = source.lower() @@ -558,9 +558,9 @@ def _load_local(hubconf_dir, model, *args, **kwargs): a single model with corresponding pretrained weights. Example: + >>> # xdoctest: +SKIP("stub local path") >>> path = '/some/local/path/pytorch/vision' - >>> # xdoctest: +SKIP - >>> model = _load_local(path, 'resnet50', pretrained=True) + >>> model = _load_local(path, 'resnet50', weights='ResNet50_Weights.IMAGENET1K_V1') """ sys.path.insert(0, hubconf_dir) @@ -587,6 +587,7 @@ def download_url_to_file(url, dst, hash_prefix=None, progress=True): Default: True Example: + >>> # xdoctest: +REQUIRES(POSIX) >>> torch.hub.download_url_to_file('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth', '/tmp/temporary_file') """ diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py index 851587c5f919..277aa396f625 100644 --- a/torch/masked/maskedtensor/core.py +++ b/torch/masked/maskedtensor/core.py @@ -21,6 +21,8 @@ def is_masked_tensor(a): Examples: + >>> # xdoctest: +SKIP + >>> from torch.masked.maskedtensor.creation import masked_tensor >>> data = torch.arange(6).reshape(2,3) >>> mask = torch.tensor([[True, False, False], [True, True, False]]) >>> mt = masked_tensor(data, mask) diff --git a/torch/masked/maskedtensor/creation.py b/torch/masked/maskedtensor/creation.py index 7919e2df863e..15acc20efcd9 100644 --- a/torch/masked/maskedtensor/creation.py +++ b/torch/masked/maskedtensor/creation.py @@ -19,6 +19,7 @@ def masked_tensor(data, mask, requires_grad=False): Examples:: + >>> # xdoctest: +SKIP >>> data = torch.arange(6).reshape(2,3) >>> mask = torch.tensor([[True, False, False], [True, True, False]]) >>> mt = masked_tensor(data, mask) diff --git a/torch/monitor/__init__.py b/torch/monitor/__init__.py index 723936c8382a..b8589bb00087 100644 --- a/torch/monitor/__init__.py +++ b/torch/monitor/__init__.py @@ -16,6 +16,7 @@ class TensorboardEventHandler: This currently only supports ``torch.monitor.Stat`` events which are logged as scalars. 
+ >>> # xdoctest: +REQUIRES(module:tensorboard) >>> from torch.utils.tensorboard import SummaryWriter >>> from torch.monitor import TensorboardEventHandler, register_event_handler >>> writer = SummaryWriter("log_dir") diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 640428de0bb6..496d2b1b5ee6 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -2677,7 +2677,7 @@ def nll_loss( >>> input = torch.randn(3, 5, requires_grad=True) >>> # each element in target has to have 0 <= value < C >>> target = torch.tensor([1, 0, 4]) - >>> output = F.nll_loss(F.log_softmax(input), target) + >>> output = F.nll_loss(F.log_softmax(input, dim=1), target) >>> output.backward() """ if has_torch_function_variadic(input, target, weight): diff --git a/torch/nn/init.py b/torch/nn/init.py index 6ea582d6189b..b70e7f5e390c 100644 --- a/torch/nn/init.py +++ b/torch/nn/init.py @@ -463,7 +463,7 @@ def orthogonal_(tensor, gain=1): gain: optional scaling factor Examples: - >>> # xdoctest: +REQUIRES(--lapack) + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK) >>> w = torch.empty(3, 5) >>> nn.init.orthogonal_(w) """ diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 88c3322879b7..c816437abdbf 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -1323,7 +1323,7 @@ class Softmin(Module): Examples:: - >>> m = nn.Softmin() + >>> m = nn.Softmin(dim=1) >>> input = torch.randn(2, 3) >>> output = m(input) """ @@ -1450,7 +1450,7 @@ class LogSoftmax(Module): Examples:: - >>> m = nn.LogSoftmax() + >>> m = nn.LogSoftmax(dim=1) >>> input = torch.randn(2, 3) >>> output = m(input) """ diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py index 094e91b2e695..382accfef560 100644 --- a/torch/nn/modules/batchnorm.py +++ b/torch/nn/modules/batchnorm.py @@ -13,6 +13,7 @@ from .module import Module __all__ = ['BatchNorm1d', 'LazyBatchNorm1d', 'BatchNorm2d', 'LazyBatchNorm2d', 'BatchNorm3d', 'LazyBatchNorm3d', 'SyncBatchNorm'] + class _NormBase(Module): """Common base of _InstanceNorm and _BatchNorm""" @@ -779,6 +780,7 @@ class SyncBatchNorm(_BatchNorm): Example:: >>> # Network with nn.BatchNorm layer + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) >>> module = torch.nn.Sequential( >>> torch.nn.Linear(20, 100), >>> torch.nn.BatchNorm1d(100), @@ -790,7 +792,7 @@ class SyncBatchNorm(_BatchNorm): >>> # Note: every rank calls into new_group for every >>> # process group created, even if that rank is not >>> # part of the group. - >>> # xdoctest: +SKIP + >>> # xdoctest: +SKIP("distributed") >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]] >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1] >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group) diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 1d466fbf2c62..85de8c549edb 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -449,15 +449,16 @@ class KLDivLoss(_Loss): Examples:: + >>> import torch.nn.functional as F >>> kl_loss = nn.KLDivLoss(reduction="batchmean") >>> # input should be a distribution in the log space - >>> input = F.log_softmax(torch.randn(3, 5, requires_grad=True)) + >>> input = F.log_softmax(torch.randn(3, 5, requires_grad=True), dim=1) >>> # Sample a batch of distributions. 
Usually this would come from the dataset - >>> target = F.softmax(torch.rand(3, 5)) + >>> target = F.softmax(torch.rand(3, 5), dim=1) >>> output = kl_loss(input, target) >>> kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True) - >>> log_target = F.log_softmax(torch.rand(3, 5)) + >>> log_target = F.log_softmax(torch.rand(3, 5), dim=1) >>> output = kl_loss(input, log_target) """ __constants__ = ['reduction'] diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 131b45f73938..50f82d50cf82 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -21,6 +21,7 @@ _grad_t = Union[Tuple[Tensor, ...], Tensor] # the type of the subclass, not the looser type of `Module`. T = TypeVar('T', bound='Module') + class _IncompatibleKeys(namedtuple('IncompatibleKeys', ['missing_keys', 'unexpected_keys'])): def __repr__(self): if not self.missing_keys and not self.unexpected_keys: @@ -41,6 +42,7 @@ def _addindent(s_, numSpaces): s = first + '\n' + s return s + class _WrappedHook: def __init__(self, hook: Callable, module: Optional["Module"] = None): self.hook: Callable = hook @@ -151,6 +153,7 @@ def register_module_forward_hook(hook: Callable[..., None]) -> RemovableHandle: _global_forward_hooks[handle.id] = hook return handle + def register_module_backward_hook( hook: Callable[['Module', _grad_t, _grad_t], Union[None, Tensor]] ) -> RemovableHandle: @@ -177,6 +180,7 @@ def register_module_backward_hook( _global_backward_hooks[handle.id] = hook return handle + def register_module_full_backward_hook( hook: Callable[['Module', _grad_t, _grad_t], Union[None, Tensor]] ) -> RemovableHandle: @@ -933,6 +937,7 @@ class Module: Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]], dtype=torch.float64) + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA1) >>> gpu1 = torch.device("cuda:1") >>> linear.to(gpu1, dtype=torch.half, non_blocking=True) Linear(in_features=2, out_features=2, bias=True) diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index eb45e20db56d..93a43e10c962 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -525,7 +525,7 @@ class AvgPool1d(_AvgPoolNd): >>> # pool with window of size=3, stride=2 >>> m = nn.AvgPool1d(3, stride=2) >>> m(torch.tensor([[[1.,2,3,4,5,6,7]]])) - tensor([[[ 2., 4., 6.]]]) + tensor([[[2., 4., 6.]]]) """ kernel_size: _size_1_t diff --git a/torch/nn/modules/upsampling.py b/torch/nn/modules/upsampling.py index 56711f295414..4f13c84c2e90 100644 --- a/torch/nn/modules/upsampling.py +++ b/torch/nn/modules/upsampling.py @@ -7,6 +7,7 @@ from ..common_types import _size_2_t, _ratio_2_t, _size_any_t, _ratio_any_t __all__ = ['Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d'] + class Upsample(Module): r"""Upsamples a given multi-channel 1D (temporal), 2D (spatial) or 3D (volumetric) data. 
@@ -73,62 +74,61 @@ class Upsample(Module): >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2) >>> input - tensor([[[[ 1., 2.], - [ 3., 4.]]]]) + tensor([[[[1., 2.], + [3., 4.]]]]) >>> m = nn.Upsample(scale_factor=2, mode='nearest') >>> m(input) - tensor([[[[ 1., 1., 2., 2.], - [ 1., 1., 2., 2.], - [ 3., 3., 4., 4.], - [ 3., 3., 4., 4.]]]]) + tensor([[[[1., 1., 2., 2.], + [1., 1., 2., 2.], + [3., 3., 4., 4.], + [3., 3., 4., 4.]]]]) >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles") >>> m = nn.Upsample(scale_factor=2, mode='bilinear') # align_corners=False >>> m(input) - tensor([[[[ 1.0000, 1.2500, 1.7500, 2.0000], - [ 1.5000, 1.7500, 2.2500, 2.5000], - [ 2.5000, 2.7500, 3.2500, 3.5000], - [ 3.0000, 3.2500, 3.7500, 4.0000]]]]) + tensor([[[[1.0000, 1.2500, 1.7500, 2.0000], + [1.5000, 1.7500, 2.2500, 2.5000], + [2.5000, 2.7500, 3.2500, 3.5000], + [3.0000, 3.2500, 3.7500, 4.0000]]]]) >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) >>> m(input) - tensor([[[[ 1.0000, 1.3333, 1.6667, 2.0000], - [ 1.6667, 2.0000, 2.3333, 2.6667], - [ 2.3333, 2.6667, 3.0000, 3.3333], - [ 3.0000, 3.3333, 3.6667, 4.0000]]]]) + tensor([[[[1.0000, 1.3333, 1.6667, 2.0000], + [1.6667, 2.0000, 2.3333, 2.6667], + [2.3333, 2.6667, 3.0000, 3.3333], + [3.0000, 3.3333, 3.6667, 4.0000]]]]) >>> # Try scaling the same data in a larger tensor - >>> >>> input_3x3 = torch.zeros(3, 3).view(1, 1, 3, 3) >>> input_3x3[:, :, :2, :2].copy_(input) - tensor([[[[ 1., 2.], - [ 3., 4.]]]]) + tensor([[[[1., 2.], + [3., 4.]]]]) >>> input_3x3 - tensor([[[[ 1., 2., 0.], - [ 3., 4., 0.], - [ 0., 0., 0.]]]]) + tensor([[[[1., 2., 0.], + [3., 4., 0.], + [0., 0., 0.]]]]) >>> # xdoctest: +IGNORE_WANT("seems to fail when other tests are run in the same session") >>> m = nn.Upsample(scale_factor=2, mode='bilinear') # align_corners=False >>> # Notice that values in top left corner are the same with the small input (except at boundary) >>> m(input_3x3) - tensor([[[[ 1.0000, 1.2500, 1.7500, 1.5000, 0.5000, 0.0000], - [ 1.5000, 1.7500, 2.2500, 1.8750, 0.6250, 0.0000], - [ 2.5000, 2.7500, 3.2500, 2.6250, 0.8750, 0.0000], - [ 2.2500, 2.4375, 2.8125, 2.2500, 0.7500, 0.0000], - [ 0.7500, 0.8125, 0.9375, 0.7500, 0.2500, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + tensor([[[[1.0000, 1.2500, 1.7500, 1.5000, 0.5000, 0.0000], + [1.5000, 1.7500, 2.2500, 1.8750, 0.6250, 0.0000], + [2.5000, 2.7500, 3.2500, 2.6250, 0.8750, 0.0000], + [2.2500, 2.4375, 2.8125, 2.2500, 0.7500, 0.0000], + [0.7500, 0.8125, 0.9375, 0.7500, 0.2500, 0.0000], + [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) >>> # Notice that values in top left corner are now changed >>> m(input_3x3) - tensor([[[[ 1.0000, 1.4000, 1.8000, 1.6000, 0.8000, 0.0000], - [ 1.8000, 2.2000, 2.6000, 2.2400, 1.1200, 0.0000], - [ 2.6000, 3.0000, 3.4000, 2.8800, 1.4400, 0.0000], - [ 2.4000, 2.7200, 3.0400, 2.5600, 1.2800, 0.0000], - [ 1.2000, 1.3600, 1.5200, 1.2800, 0.6400, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + tensor([[[[1.0000, 1.4000, 1.8000, 1.6000, 0.8000, 0.0000], + [1.8000, 2.2000, 2.6000, 2.2400, 1.1200, 0.0000], + [2.6000, 3.0000, 3.4000, 2.8800, 1.4400, 0.0000], + [2.4000, 2.7200, 3.0400, 2.5600, 1.2800, 0.0000], + [1.2000, 1.3600, 1.5200, 1.2800, 0.6400, 0.0000], + [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) """ __constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name', 'recompute_scale_factor'] 
name: str @@ -196,15 +196,15 @@ class UpsamplingNearest2d(Upsample): >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2) >>> input - tensor([[[[ 1., 2.], - [ 3., 4.]]]]) + tensor([[[[1., 2.], + [3., 4.]]]]) >>> m = nn.UpsamplingNearest2d(scale_factor=2) >>> m(input) - tensor([[[[ 1., 1., 2., 2.], - [ 1., 1., 2., 2.], - [ 3., 3., 4., 4.], - [ 3., 3., 4., 4.]]]]) + tensor([[[[1., 1., 2., 2.], + [1., 1., 2., 2.], + [3., 3., 4., 4.], + [3., 3., 4., 4.]]]]) """ def __init__(self, size: Optional[_size_2_t] = None, scale_factor: Optional[_ratio_2_t] = None) -> None: super(UpsamplingNearest2d, self).__init__(size, scale_factor, mode='nearest') @@ -242,16 +242,16 @@ class UpsamplingBilinear2d(Upsample): >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2) >>> input - tensor([[[[ 1., 2.], - [ 3., 4.]]]]) + tensor([[[[1., 2.], + [3., 4.]]]]) >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?") >>> m = nn.UpsamplingBilinear2d(scale_factor=2) >>> m(input) - tensor([[[[ 1.0000, 1.3333, 1.6667, 2.0000], - [ 1.6667, 2.0000, 2.3333, 2.6667], - [ 2.3333, 2.6667, 3.0000, 3.3333], - [ 3.0000, 3.3333, 3.6667, 4.0000]]]]) + tensor([[[[1.0000, 1.3333, 1.6667, 2.0000], + [1.6667, 2.0000, 2.3333, 2.6667], + [2.3333, 2.6667, 3.0000, 3.3333], + [3.0000, 3.3333, 3.6667, 4.0000]]]]) """ def __init__(self, size: Optional[_size_2_t] = None, scale_factor: Optional[_ratio_2_t] = None) -> None: super(UpsamplingBilinear2d, self).__init__(size, scale_factor, mode='bilinear', align_corners=True) diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index 7c726f7b114f..46acab481140 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -31,6 +31,7 @@ def _reverse_repeat_padding(padding: List[int]) -> List[int]: _reversed_padding_repeated_twice.append(padding[N - idx - 1]) return _reversed_padding_repeated_twice + class _ConvNd(WeightedQuantizedModule): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, @@ -266,6 +267,7 @@ class _ConvNd(WeightedQuantizedModule): qconv.zero_point = int(output_zero_point) return qconv + class Conv1d(_ConvNd): r"""Applies a 1D convolution over a quantized input signal composed of several quantized input planes. @@ -295,7 +297,7 @@ class Conv1d(_ConvNd): >>> # quantize input to quint8 >>> # xdoctest: +SKIP >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, - dtype=torch.quint8) + ... dtype=torch.quint8) >>> output = m(q_input) """ @@ -572,6 +574,7 @@ class Conv3d(_ConvNd): # === Transposed Convolutions === MOD = TypeVar('MOD', bound=nn.modules.conv._ConvNd) + class _ConvTransposeNd(_ConvNd): _FLOAT_MODULE = MOD @@ -655,6 +658,7 @@ class _ConvTransposeNd(_ConvNd): qconv.zero_point = int(output_zero_point) return qconv + class ConvTranspose1d(_ConvTransposeNd): r"""Applies a 1D transposed convolution operator over an input image composed of several input planes. 
@@ -675,9 +679,10 @@ class ConvTranspose1d(_ConvTransposeNd):
 
     Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
         >>> torch.backends.quantized.engine = 'qnnpack'
+        >>> from torch.nn import quantized as nnq
         >>> # With square kernels and equal stride
-        >>> # xdoctest: +SKIP
         >>> m = nnq.ConvTranspose1d(16, 33, 3, stride=2)
         >>> # non-square kernels and unequal stride and with padding
         >>> m = nnq.ConvTranspose1d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
@@ -692,6 +697,7 @@ class ConvTranspose1d(_ConvTransposeNd):
         >>> h = downsample(q_input)
         >>> h.size()
         torch.Size([1, 16, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter")
         >>> output = upsample(h, output_size=input.size())
         >>> output.size()
         torch.Size([1, 16, 12])
@@ -763,10 +769,11 @@ class ConvTranspose2d(_ConvTransposeNd):
 
     Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
         >>> # QNNPACK or FBGEMM as backend
         >>> torch.backends.quantized.engine = 'qnnpack'
         >>> # With square kernels and equal stride
-        >>> # xdoctest: +SKIP
+        >>> import torch.nn.quantized as nnq
         >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2)
         >>> # non-square kernels and unequal stride and with padding
         >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
@@ -781,6 +788,7 @@ class ConvTranspose2d(_ConvTransposeNd):
         >>> h = downsample(q_input)
         >>> h.size()
         torch.Size([1, 16, 6, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter")
         >>> output = upsample(h, output_size=input.size())
         >>> output.size()
         torch.Size([1, 16, 12, 12])
@@ -834,6 +842,7 @@ class ConvTranspose2d(_ConvTransposeNd):
     def from_reference(cls, ref_qconvt, output_scale, output_zero_point):
         return _ConvTransposeNd.from_reference(cls, ref_qconvt, output_scale, output_zero_point)
 
+
 class ConvTranspose3d(_ConvTransposeNd):
     r"""Applies a 3D transposed convolution operator over an input image
     composed of several input planes.
@@ -854,9 +863,10 @@ class ConvTranspose3d(_ConvTransposeNd):
 
     Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
         >>> torch.backends.quantized.engine = 'fbgemm'
+        >>> from torch.nn import quantized as nnq
         >>> # With cubic kernels and equal stride
-        >>> # xdoctest: +SKIP
         >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2)
         >>> # non-cubic kernels and unequal stride and with padding
         >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2))
@@ -871,6 +881,7 @@ class ConvTranspose3d(_ConvTransposeNd):
         >>> h = downsample(q_input)
         >>> h.size()
         torch.Size([1, 16, 6, 6, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter")
         >>> output = upsample(h, output_size=input.size())
         >>> output.size()
         torch.Size([1, 16, 12, 12, 12])
diff --git a/torch/nn/utils/parametrizations.py b/torch/nn/utils/parametrizations.py
index 3dd5192c1062..7b097f667671 100644
--- a/torch/nn/utils/parametrizations.py
+++ b/torch/nn/utils/parametrizations.py
@@ -10,6 +10,7 @@ from typing import Optional
 
 __all__ = ['orthogonal', 'spectral_norm']
 
+
 def _is_orthogonal(Q, eps=None):
     n, k = Q.size(-2), Q.size(-1)
     Id = torch.eye(k, dtype=Q.dtype, device=Q.device)
@@ -242,7 +243,7 @@ def orthogonal(module: Module,
 
     Example::
 
-        >>> # xdoctest: +REQUIRES(--lapack)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
         >>> orth_linear = orthogonal(nn.Linear(20, 40))
         >>> orth_linear
         ParametrizedLinear(
@@ -459,19 +460,20 @@ def spectral_norm(module: Module,
 
     Example::
 
-        >>> # xdoctest: +REQUIRES(--lapack)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
         >>> snm = spectral_norm(nn.Linear(20, 40))
         >>> snm
         ParametrizedLinear(
-        in_features=20, out_features=40, bias=True
-        (parametrizations): ModuleDict(
+          in_features=20, out_features=40, bias=True
+          (parametrizations): ModuleDict(
             (weight): ParametrizationList(
-            (0): _SpectralNorm()
+              (0): _SpectralNorm()
             )
-        )
+          )
         )
         >>> torch.linalg.matrix_norm(snm.weight, 2)
-        tensor(1.0000, grad_fn=<AmaxBackward0>)
+        tensor(1.0081, grad_fn=<AmaxBackward0>)
diff --git a/torch/nn/utils/parametrize.py b/torch/nn/utils/parametrize.py
index 32d71b42f9ca..b8f8d439c1b7 100644
--- a/torch/nn/utils/parametrize.py
+++ b/torch/nn/utils/parametrize.py
@@ -460,7 +460,7 @@ def register_parametrization(
         ValueError: if the module does not have a parameter or a buffer named :attr:`tensor_name`
 
     Examples:
-        >>> # xdoctest: +REQUIRES(--lapack)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
         >>> import torch
         >>> import torch.nn as nn
         >>> import torch.nn.utils.parametrize as P
diff --git a/torch/onnx/_type_utils.py b/torch/onnx/_type_utils.py
index e565c6e9d664..70b1b3f868ec 100644
--- a/torch/onnx/_type_utils.py
+++ b/torch/onnx/_type_utils.py
@@ -55,6 +55,7 @@ class JitScalarType(enum.IntEnum):
 
     Use ``JitScalarType`` to convert from torch and JIT scalar types to ONNX scalar types.
Examples:: + >>> # xdoctest: +IGNORE_WANT("win32 has different output") >>> JitScalarType.from_name("Float").onnx_type() TensorProtoDataType.FLOAT """ diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 2431d889d1a8..c3c40d1ef0f0 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -725,6 +725,7 @@ class PolynomialLR(_LRScheduler): >>> # lr = 0.00050 if epoch == 2 >>> # lr = 0.00025 if epoch == 3 >>> # lr = 0.0 if epoch >= 4 + >>> # xdoctest: +SKIP("undefined vars") >>> scheduler = PolynomialLR(self.opt, total_iters=4, power=1.0) >>> for epoch in range(100): >>> train(...) diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index 0b59c43736d3..4cf957034cbd 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -108,6 +108,7 @@ class IterableDataset(Dataset[T_co]): >>> print(list(torch.utils.data.DataLoader(ds, num_workers=0))) [tensor([3]), tensor([4]), tensor([5]), tensor([6])] + >>> # xdoctest: +REQUIRES(POSIX) >>> # Mult-process loading with two worker processes >>> # Worker 0 fetched [3, 4]. Worker 1 fetched [5, 6]. >>> # xdoctest: +IGNORE_WANT("non deterministic") @@ -116,7 +117,7 @@ class IterableDataset(Dataset[T_co]): >>> # With even more workers >>> # xdoctest: +IGNORE_WANT("non deterministic") - >>> print(list(torch.utils.data.DataLoader(ds, num_workers=20))) + >>> print(list(torch.utils.data.DataLoader(ds, num_workers=12))) [tensor([3]), tensor([5]), tensor([4]), tensor([6])] Example 2: splitting workload across all workers using :attr:`worker_init_fn`:: @@ -161,7 +162,7 @@ class IterableDataset(Dataset[T_co]): [3, 5, 4, 6] >>> # With even more workers - >>> print(list(torch.utils.data.DataLoader(ds, num_workers=20, worker_init_fn=worker_init_fn))) + >>> print(list(torch.utils.data.DataLoader(ds, num_workers=12, worker_init_fn=worker_init_fn))) [3, 4, 5, 6] """ def __iter__(self) -> Iterator[T_co]: diff --git a/torch/utils/dlpack.py b/torch/utils/dlpack.py index 2fe5a5c1d2c0..ae0aafceb178 100644 --- a/torch/utils/dlpack.py +++ b/torch/utils/dlpack.py @@ -42,6 +42,7 @@ Args: The DLPack capsule shares the tensor's memory. """) + # TODO: add a typing.Protocol to be able to tell Mypy that only objects with # __dlpack__ and __dlpack_device__ methods are accepted. def from_dlpack(ext_tensor: Any) -> torch.Tensor: diff --git a/torch/utils/throughput_benchmark.py b/torch/utils/throughput_benchmark.py index 7068f74d0906..1dae4b937783 100644 --- a/torch/utils/throughput_benchmark.py +++ b/torch/utils/throughput_benchmark.py @@ -1,6 +1,7 @@ import torch._C + def format_time(time_us=None, time_ms=None, time_s=None): '''Defines how to format time''' assert sum([time_us is not None, time_ms is not None, time_s is not None]) == 1 @@ -48,7 +49,6 @@ class ExecutionStats(object): return self.num_iters * ( self.latency_avg_ms / 1000.0) / self.benchmark_config.num_calling_threads - def __str__(self): return '\n'.join([ "Average latency per example: " + format_time(time_ms=self.latency_avg_ms),
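Taken together, the patch standardizes four xdoctest directive patterns: `+REQUIRES(env:...)` for feature-gated examples (the matching environment variables are exported by `run_doctests` when the feature is detected), `+REQUIRES(module:...)` for optional dependencies, `+IGNORE_WANT("reason")` for non-deterministic output, and `+SKIP("reason")` for examples that cannot run standalone. A minimal self-contained illustration of these conventions, using a hypothetical `scaled_eye` helper that is not part of the patch:

```python
# Hypothetical illustration (not part of the patch) of the xdoctest
# directive conventions applied throughout this diff.
import torch


def scaled_eye(n: int) -> torch.Tensor:
    """Return an identity matrix scaled by 2.

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
        >>> # Gated: only runs when run_test.py detects LAPACK and
        >>> # exports TORCH_DOCTEST_LAPACK=1.
        >>> torch.linalg.inv(scaled_eye(2))
        tensor([[0.5000, 0.0000],
                [0.0000, 0.5000]])

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> torch.randn(2) + scaled_eye(2)[0]
        tensor([ 2.1320,  0.1345])

        >>> # xdoctest: +SKIP("illustration only, never executed")
        >>> scaled_eye(-1)
    """
    return 2 * torch.eye(n)
```

Under the google style configured in `run_doctests`, xdoctest collects these blocks from every docstring in the torch package, so a directive comment placed on the line before a statement governs only that example.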