Doc test non packages (#110568)

Add non-package Python modules to the public API checks.
The core change is to remove the `ispkg` check on this line:
https://github.com/pytorch/pytorch/blob/main/docs/source/conf.py#L518
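For reference, that check sits inside a `pkgutil.walk_packages` loop over the `torch` package tree; here is a minimal sketch of the relevant logic (variable names are illustrative, not the verbatim conf.py code):

```python
import pkgutil

import torch

public_modules = []
for _, modname, ispkg in pkgutil.walk_packages(
    path=torch.__path__, prefix=torch.__name__ + "."
):
    # Before this PR the walk skipped plain modules (`if not ispkg: continue`);
    # dropping that check pulls every importable submodule into the audit.
    public_modules.append(modname)
```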

Everything else is in support of that change:
- add the appropriate modules to the rst files;
- make sure every module we ship can be imported (fixed by either making optional dependencies actually optional or deleting files that have been un-importable for 3 years);
- make APIs that are both a module and a function (like `torch.autograd.gradcheck`) render properly on the docs website without confusion;
- add every non-documented API to the allow list (~3k of them).
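The importability requirement can be verified with a short walk over the package tree; a hedged sketch (not the actual check PyTorch runs):

```python
import importlib
import pkgutil

import torch

failures = []
for _, modname, _ in pkgutil.walk_packages(torch.__path__, prefix="torch."):
    try:
        importlib.import_module(modname)
    except Exception as exc:  # e.g. an optional dependency that is not installed
        failures.append((modname, exc))

for modname, exc in failures:
    print(f"{modname}: {exc!r}")
```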

Next steps will be to try to fix these missing docs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110568
Approved by: https://github.com/zou3519
Author: albanD
Date: 2023-10-06 14:16:01 +00:00
Committed by: PyTorch MergeBot
Parent: a3e5ec453a
Commit: c4db607607
26 changed files with 3820 additions and 662 deletions

View File

@@ -384,3 +384,12 @@ Some ops not listed here (e.g., binary ops like ``add``) natively promote
inputs without autocasting's intervention. If inputs are a mixture of ``bfloat16``
and ``float32``, these ops run in ``float32`` and produce ``float32`` output,
regardless of whether autocast is enabled.
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.amp.autocast_mode
.. py:module:: torch.cpu.amp.autocast_mode
.. py:module:: torch.cuda.amp.autocast_mode
.. py:module:: torch.cuda.amp.common
.. py:module:: torch.cuda.amp.grad_scaler

View File

@@ -214,6 +214,10 @@ When creating a new :class:`Function`, the following methods are available to `c
Numerical gradient checking
^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: torch.autograd.gradcheck
.. currentmodule:: torch.autograd.gradcheck
.. autosummary::
:toctree: generated
:nosignatures:
@@ -221,6 +225,9 @@ Numerical gradient checking
gradcheck
gradgradcheck
.. Just to reset the base path for the rest of this file
.. currentmodule:: torch.autograd
Profiler
^^^^^^^^
@@ -309,3 +316,17 @@ Also see :ref:`saved-tensors-hooks-doc`.
.. autoclass:: torch.autograd.graph.register_multi_grad_hook
.. autoclass:: torch.autograd.graph.allow_mutation_on_saved_tensors
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.autograd.anomaly_mode
.. py:module:: torch.autograd.forward_ad
.. py:module:: torch.autograd.function
.. py:module:: torch.autograd.functional
.. py:module:: torch.autograd.grad_mode
.. py:module:: torch.autograd.graph
.. py:module:: torch.autograd.profiler
.. py:module:: torch.autograd.profiler_legacy
.. py:module:: torch.autograd.profiler_util
.. py:module:: torch.autograd.variable
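For context on the module-vs-function ambiguity mentioned in the description: `torch.autograd.gradcheck` names both a submodule and a function re-exported from it, which is why the rst above registers the module with `automodule` and then documents the callables through `autosummary`. A small demonstration:

```python
import importlib

import torch.autograd

fn = torch.autograd.gradcheck  # attribute access yields the re-exported function
mod = importlib.import_module("torch.autograd.gradcheck")  # the submodule itself
print(callable(fn), fn is mod.gradcheck)  # True True
```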

View File

@@ -117,6 +117,7 @@ torch.backends.cudnn
available algorithm. Note that this setting only affects convolutions dispatched via the
cuDNN v8 API.
.. py:module:: torch.backends.cudnn.rnn
torch.backends.mps
^^^^^^^^^^^^^^^^^^
@@ -187,3 +188,4 @@ torch.backends.opt_einsum
torch.backends.xeon
^^^^^^^^^^^^^^^^^^^
.. automodule:: torch.backends.xeon
.. py:module:: torch.backends.xeon.run_cpu

File diff suppressed because it is too large

View File

@@ -153,3 +153,18 @@ See the :doc:`documentation <cuda._sanitizer>` for information on how to use it.
:hidden:
cuda._sanitizer
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.cuda.comm
.. py:module:: torch.cuda.error
.. py:module:: torch.cuda.graphs
.. py:module:: torch.cuda.jiterator
.. py:module:: torch.cuda.memory
.. py:module:: torch.cuda.nccl
.. py:module:: torch.cuda.nvtx
.. py:module:: torch.cuda.profiler
.. py:module:: torch.cuda.random
.. py:module:: torch.cuda.sparse
.. py:module:: torch.cuda.streams

View File

@@ -874,3 +874,118 @@ Distributed components raise custom Exception types derived from `RuntimeError`:
.. py:module:: torch.distributed.pipeline.sync
.. py:module:: torch.distributed.pipeline.sync.skip
.. py:module:: torch.distributed.tensor
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.ddp_zero_hook
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.mixed_precision_hooks
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.quantization_hooks
.. py:module:: torch.distributed.algorithms.join
.. py:module:: torch.distributed.algorithms.model_averaging.averagers
.. py:module:: torch.distributed.algorithms.model_averaging.hierarchical_model_averager
.. py:module:: torch.distributed.algorithms.model_averaging.utils
.. py:module:: torch.distributed.argparse_util
.. py:module:: torch.distributed.c10d_logger
.. py:module:: torch.distributed.checkpoint.api
.. py:module:: torch.distributed.checkpoint.default_planner
.. py:module:: torch.distributed.checkpoint.filesystem
.. py:module:: torch.distributed.checkpoint.metadata
.. py:module:: torch.distributed.checkpoint.optimizer
.. py:module:: torch.distributed.checkpoint.planner
.. py:module:: torch.distributed.checkpoint.planner_helpers
.. py:module:: torch.distributed.checkpoint.resharding
.. py:module:: torch.distributed.checkpoint.state_dict_loader
.. py:module:: torch.distributed.checkpoint.state_dict_saver
.. py:module:: torch.distributed.checkpoint.storage
.. py:module:: torch.distributed.checkpoint.utils
.. py:module:: torch.distributed.collective_utils
.. py:module:: torch.distributed.constants
.. py:module:: torch.distributed.distributed_c10d
.. py:module:: torch.distributed.elastic.agent.server.api
.. py:module:: torch.distributed.elastic.agent.server.local_elastic_agent
.. py:module:: torch.distributed.elastic.events.api
.. py:module:: torch.distributed.elastic.events.handlers
.. py:module:: torch.distributed.elastic.metrics.api
.. py:module:: torch.distributed.elastic.multiprocessing.api
.. py:module:: torch.distributed.elastic.multiprocessing.errors.error_handler
.. py:module:: torch.distributed.elastic.multiprocessing.errors.handlers
.. py:module:: torch.distributed.elastic.multiprocessing.redirects
.. py:module:: torch.distributed.elastic.multiprocessing.tail_log
.. py:module:: torch.distributed.elastic.rendezvous.api
.. py:module:: torch.distributed.elastic.rendezvous.c10d_rendezvous_backend
.. py:module:: torch.distributed.elastic.rendezvous.dynamic_rendezvous
.. py:module:: torch.distributed.elastic.rendezvous.etcd_rendezvous
.. py:module:: torch.distributed.elastic.rendezvous.etcd_rendezvous_backend
.. py:module:: torch.distributed.elastic.rendezvous.etcd_server
.. py:module:: torch.distributed.elastic.rendezvous.etcd_store
.. py:module:: torch.distributed.elastic.rendezvous.static_tcp_rendezvous
.. py:module:: torch.distributed.elastic.rendezvous.utils
.. py:module:: torch.distributed.elastic.timer.api
.. py:module:: torch.distributed.elastic.timer.file_based_local_timer
.. py:module:: torch.distributed.elastic.timer.local_timer
.. py:module:: torch.distributed.elastic.utils.api
.. py:module:: torch.distributed.elastic.utils.data.cycling_iterator
.. py:module:: torch.distributed.elastic.utils.data.elastic_distributed_sampler
.. py:module:: torch.distributed.elastic.utils.distributed
.. py:module:: torch.distributed.elastic.utils.log_level
.. py:module:: torch.distributed.elastic.utils.logging
.. py:module:: torch.distributed.elastic.utils.store
.. py:module:: torch.distributed.fsdp.api
.. py:module:: torch.distributed.fsdp.fully_sharded_data_parallel
.. py:module:: torch.distributed.fsdp.sharded_grad_scaler
.. py:module:: torch.distributed.fsdp.wrap
.. py:module:: torch.distributed.launcher.api
.. py:module:: torch.distributed.logging_handlers
.. py:module:: torch.distributed.nn.api.remote_module
.. py:module:: torch.distributed.nn.functional
.. py:module:: torch.distributed.nn.jit.instantiator
.. py:module:: torch.distributed.nn.jit.templates.remote_module_template
.. py:module:: torch.distributed.optim.apply_optimizer_in_backward
.. py:module:: torch.distributed.optim.functional_adadelta
.. py:module:: torch.distributed.optim.functional_adagrad
.. py:module:: torch.distributed.optim.functional_adam
.. py:module:: torch.distributed.optim.functional_adamax
.. py:module:: torch.distributed.optim.functional_adamw
.. py:module:: torch.distributed.optim.functional_rmsprop
.. py:module:: torch.distributed.optim.functional_rprop
.. py:module:: torch.distributed.optim.functional_sgd
.. py:module:: torch.distributed.optim.named_optimizer
.. py:module:: torch.distributed.optim.optimizer
.. py:module:: torch.distributed.optim.post_localSGD_optimizer
.. py:module:: torch.distributed.optim.utils
.. py:module:: torch.distributed.optim.zero_redundancy_optimizer
.. py:module:: torch.distributed.pipeline.sync.batchnorm
.. py:module:: torch.distributed.pipeline.sync.checkpoint
.. py:module:: torch.distributed.pipeline.sync.copy
.. py:module:: torch.distributed.pipeline.sync.dependency
.. py:module:: torch.distributed.pipeline.sync.microbatch
.. py:module:: torch.distributed.pipeline.sync.phony
.. py:module:: torch.distributed.pipeline.sync.pipe
.. py:module:: torch.distributed.pipeline.sync.pipeline
.. py:module:: torch.distributed.pipeline.sync.skip.layout
.. py:module:: torch.distributed.pipeline.sync.skip.namespace
.. py:module:: torch.distributed.pipeline.sync.skip.portal
.. py:module:: torch.distributed.pipeline.sync.skip.skippable
.. py:module:: torch.distributed.pipeline.sync.skip.tracker
.. py:module:: torch.distributed.pipeline.sync.stream
.. py:module:: torch.distributed.pipeline.sync.utils
.. py:module:: torch.distributed.pipeline.sync.worker
.. py:module:: torch.distributed.remote_device
.. py:module:: torch.distributed.rendezvous
.. py:module:: torch.distributed.rpc.api
.. py:module:: torch.distributed.rpc.backend_registry
.. py:module:: torch.distributed.rpc.constants
.. py:module:: torch.distributed.rpc.functions
.. py:module:: torch.distributed.rpc.internal
.. py:module:: torch.distributed.rpc.options
.. py:module:: torch.distributed.rpc.rref_proxy
.. py:module:: torch.distributed.rpc.server_process_global_profiler
.. py:module:: torch.distributed.tensor.parallel.api
.. py:module:: torch.distributed.tensor.parallel.ddp
.. py:module:: torch.distributed.tensor.parallel.fsdp
.. py:module:: torch.distributed.tensor.parallel.input_reshard
.. py:module:: torch.distributed.tensor.parallel.style
.. py:module:: torch.distributed.utils
.. py:module:: torch.distributed.checkpoint.state_dict

View File

@@ -394,3 +394,47 @@ Probability distributions - torch.distributions
.. automodule:: torch.distributions.constraint_registry
:members:
:member-order: bysource
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.distributions.bernoulli
.. py:module:: torch.distributions.beta
.. py:module:: torch.distributions.binomial
.. py:module:: torch.distributions.categorical
.. py:module:: torch.distributions.cauchy
.. py:module:: torch.distributions.chi2
.. py:module:: torch.distributions.continuous_bernoulli
.. py:module:: torch.distributions.dirichlet
.. py:module:: torch.distributions.distribution
.. py:module:: torch.distributions.exp_family
.. py:module:: torch.distributions.exponential
.. py:module:: torch.distributions.fishersnedecor
.. py:module:: torch.distributions.gamma
.. py:module:: torch.distributions.geometric
.. py:module:: torch.distributions.gumbel
.. py:module:: torch.distributions.half_cauchy
.. py:module:: torch.distributions.half_normal
.. py:module:: torch.distributions.independent
.. py:module:: torch.distributions.kumaraswamy
.. py:module:: torch.distributions.laplace
.. py:module:: torch.distributions.lkj_cholesky
.. py:module:: torch.distributions.log_normal
.. py:module:: torch.distributions.logistic_normal
.. py:module:: torch.distributions.lowrank_multivariate_normal
.. py:module:: torch.distributions.mixture_same_family
.. py:module:: torch.distributions.multinomial
.. py:module:: torch.distributions.multivariate_normal
.. py:module:: torch.distributions.negative_binomial
.. py:module:: torch.distributions.normal
.. py:module:: torch.distributions.one_hot_categorical
.. py:module:: torch.distributions.pareto
.. py:module:: torch.distributions.poisson
.. py:module:: torch.distributions.relaxed_bernoulli
.. py:module:: torch.distributions.relaxed_categorical
.. py:module:: torch.distributions.studentT
.. py:module:: torch.distributions.transformed_distribution
.. py:module:: torch.distributions.uniform
.. py:module:: torch.distributions.utils
.. py:module:: torch.distributions.von_mises
.. py:module:: torch.distributions.weibull
.. py:module:: torch.distributions.wishart

View File

@@ -577,3 +577,8 @@ API Reference
.. autoclass:: ExportGraphSignature
.. autoclass:: ModuleCallSignature
.. autoclass:: ModuleCallEntry
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.export.exported_program

View File

@@ -1125,3 +1125,75 @@ API Reference
.. py:module:: torch.fx.experimental.migrate_gradual_types
.. py:module:: torch.fx.passes.dialect
.. py:module:: torch.fx.passes.dialect.common
.. py:module:: torch.fx.annotate
.. py:module:: torch.fx.config
.. py:module:: torch.fx.experimental.accelerator_partitioner
.. py:module:: torch.fx.experimental.const_fold
.. py:module:: torch.fx.experimental.debug
.. py:module:: torch.fx.experimental.graph_gradual_typechecker
.. py:module:: torch.fx.experimental.merge_matmul
.. py:module:: torch.fx.experimental.meta_tracer
.. py:module:: torch.fx.experimental.migrate_gradual_types.constraint
.. py:module:: torch.fx.experimental.migrate_gradual_types.constraint_generator
.. py:module:: torch.fx.experimental.migrate_gradual_types.constraint_transformation
.. py:module:: torch.fx.experimental.migrate_gradual_types.operation
.. py:module:: torch.fx.experimental.migrate_gradual_types.transform_to_z3
.. py:module:: torch.fx.experimental.migrate_gradual_types.util
.. py:module:: torch.fx.experimental.migrate_gradual_types.z3_types
.. py:module:: torch.fx.experimental.normalize
.. py:module:: torch.fx.experimental.optimization
.. py:module:: torch.fx.experimental.partitioner_utils
.. py:module:: torch.fx.experimental.proxy_tensor
.. py:module:: torch.fx.experimental.recording
.. py:module:: torch.fx.experimental.refinement_types
.. py:module:: torch.fx.experimental.rewriter
.. py:module:: torch.fx.experimental.schema_type_annotation
.. py:module:: torch.fx.experimental.symbolic_shapes
.. py:module:: torch.fx.experimental.unification.core
.. py:module:: torch.fx.experimental.unification.dispatch
.. py:module:: torch.fx.experimental.unification.match
.. py:module:: torch.fx.experimental.unification.more
.. py:module:: torch.fx.experimental.unification.multipledispatch.conflict
.. py:module:: torch.fx.experimental.unification.multipledispatch.core
.. py:module:: torch.fx.experimental.unification.multipledispatch.dispatcher
.. py:module:: torch.fx.experimental.unification.multipledispatch.utils
.. py:module:: torch.fx.experimental.unification.multipledispatch.variadic
.. py:module:: torch.fx.experimental.unification.unification_tools
.. py:module:: torch.fx.experimental.unification.utils
.. py:module:: torch.fx.experimental.unification.variable
.. py:module:: torch.fx.experimental.unify_refinements
.. py:module:: torch.fx.experimental.validator
.. py:module:: torch.fx.graph
.. py:module:: torch.fx.graph_module
.. py:module:: torch.fx.immutable_collections
.. py:module:: torch.fx.interpreter
.. py:module:: torch.fx.node
.. py:module:: torch.fx.operator_schemas
.. py:module:: torch.fx.passes.annotate_getitem_nodes
.. py:module:: torch.fx.passes.backends.cudagraphs
.. py:module:: torch.fx.passes.dialect.common.cse_pass
.. py:module:: torch.fx.passes.fake_tensor_prop
.. py:module:: torch.fx.passes.graph_drawer
.. py:module:: torch.fx.passes.graph_manipulation
.. py:module:: torch.fx.passes.infra.partitioner
.. py:module:: torch.fx.passes.infra.pass_base
.. py:module:: torch.fx.passes.infra.pass_manager
.. py:module:: torch.fx.passes.net_min_base
.. py:module:: torch.fx.passes.operator_support
.. py:module:: torch.fx.passes.param_fetch
.. py:module:: torch.fx.passes.pass_manager
.. py:module:: torch.fx.passes.reinplace
.. py:module:: torch.fx.passes.shape_prop
.. py:module:: torch.fx.passes.split_module
.. py:module:: torch.fx.passes.split_utils
.. py:module:: torch.fx.passes.splitter_base
.. py:module:: torch.fx.passes.tests.test_pass_manager
.. py:module:: torch.fx.passes.tools_common
.. py:module:: torch.fx.passes.utils.common
.. py:module:: torch.fx.passes.utils.fuser_utils
.. py:module:: torch.fx.passes.utils.matcher_utils
.. py:module:: torch.fx.passes.utils.source_matcher_utils
.. py:module:: torch.fx.proxy
.. py:module:: torch.fx.subgraph_rewriter
.. py:module:: torch.fx.tensor_type
.. py:module:: torch.fx.traceback

View File

@@ -886,3 +886,7 @@ References
.. This package is missing doc. Adding it here for coverage
.. This does not add anything to the rendered page.
.. py:module:: torch.jit.mobile
.. py:module:: torch.jit.annotations
.. py:module:: torch.jit.frontend
.. py:module:: torch.jit.generate_bytecode
.. py:module:: torch.jit.quantized

View File

@@ -1,5 +1,6 @@
torch.library
===================================
.. py:module:: torch.library
Python operator registration API provides capabilities for extending PyTorch's core library
of operators with user defined operators. Currently, this can be done in two ways:
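As a hedged illustration of the registration API this page documents (the namespace `mylib` and operator `double_it` are made up):

```python
import torch
from torch.library import Library

mylib = Library("mylib", "DEF")  # "DEF": define a brand-new operator library
mylib.define("double_it(Tensor x) -> Tensor")

def double_it_cpu(x):
    return 2 * x

mylib.impl("double_it", double_it_cpu, "CPU")  # register the CPU kernel

print(torch.ops.mylib.double_it(torch.ones(3)))  # tensor([2., 2., 2.])
```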

View File

@@ -295,3 +295,12 @@ The following ops are currently supported:
Tensor.reshape
Tensor.reshape_as
Tensor.view
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.masked.maskedtensor.binary
.. py:module:: torch.masked.maskedtensor.core
.. py:module:: torch.masked.maskedtensor.creation
.. py:module:: torch.masked.maskedtensor.passthrough
.. py:module:: torch.masked.maskedtensor.reductions
.. py:module:: torch.masked.maskedtensor.unary

View File

@@ -34,3 +34,9 @@ MPS Event
:nosignatures:
event.Event
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.mps.event
.. py:module:: torch.mps.profiler

View File

@@ -174,10 +174,23 @@ The ``spawn`` function below addresses these concerns and takes care
of error propagation, out of order termination, and will actively
terminate processes upon detecting an error in one of them.
.. automodule:: torch.multiprocessing.spawn
.. currentmodule:: torch.multiprocessing.spawn
.. autofunction:: spawn
.. currentmodule:: torch.multiprocessing
.. class:: SpawnContext
Returned by :func:`~spawn` when called with ``join=False``.
.. automethod:: join
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.multiprocessing.pool
.. py:module:: torch.multiprocessing.queue
.. py:module:: torch.multiprocessing.reductions
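A hedged usage sketch of the ``spawn`` API described above (the ``worker`` function and its argument are illustrative):

```python
import torch.multiprocessing as mp

def worker(rank, msg):
    # An uncaught exception raised here propagates to the parent, which then
    # actively terminates the remaining worker processes.
    print(f"[rank {rank}] {msg}")

if __name__ == "__main__":
    mp.spawn(worker, args=("hello",), nprocs=4, join=True)
```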

View File

@@ -477,6 +477,55 @@ Lazy Modules Initialization
nn.modules.lazy.LazyModuleMixin
.. This module is kept only for backward compatibility
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.nn.backends
.. py:module:: torch.nn.utils.stateless
.. py:module:: torch.nn.backends.thnn
.. py:module:: torch.nn.common_types
.. py:module:: torch.nn.cpp
.. py:module:: torch.nn.functional
.. py:module:: torch.nn.grad
.. py:module:: torch.nn.init
.. py:module:: torch.nn.modules.activation
.. py:module:: torch.nn.modules.adaptive
.. py:module:: torch.nn.modules.batchnorm
.. py:module:: torch.nn.modules.channelshuffle
.. py:module:: torch.nn.modules.container
.. py:module:: torch.nn.modules.conv
.. py:module:: torch.nn.modules.distance
.. py:module:: torch.nn.modules.dropout
.. py:module:: torch.nn.modules.flatten
.. py:module:: torch.nn.modules.fold
.. py:module:: torch.nn.modules.instancenorm
.. py:module:: torch.nn.modules.lazy
.. py:module:: torch.nn.modules.linear
.. py:module:: torch.nn.modules.loss
.. py:module:: torch.nn.modules.module
.. py:module:: torch.nn.modules.normalization
.. py:module:: torch.nn.modules.padding
.. py:module:: torch.nn.modules.pixelshuffle
.. py:module:: torch.nn.modules.pooling
.. py:module:: torch.nn.modules.rnn
.. py:module:: torch.nn.modules.sparse
.. py:module:: torch.nn.modules.transformer
.. py:module:: torch.nn.modules.upsampling
.. py:module:: torch.nn.modules.utils
.. py:module:: torch.nn.parallel.comm
.. py:module:: torch.nn.parallel.data_parallel
.. py:module:: torch.nn.parallel.distributed
.. py:module:: torch.nn.parallel.parallel_apply
.. py:module:: torch.nn.parallel.replicate
.. py:module:: torch.nn.parallel.scatter_gather
.. py:module:: torch.nn.parameter
.. py:module:: torch.nn.utils.clip_grad
.. py:module:: torch.nn.utils.convert_parameters
.. py:module:: torch.nn.utils.fusion
.. py:module:: torch.nn.utils.init
.. py:module:: torch.nn.utils.memory_format
.. py:module:: torch.nn.utils.parametrizations
.. py:module:: torch.nn.utils.parametrize
.. py:module:: torch.nn.utils.prune
.. py:module:: torch.nn.utils.rnn
.. py:module:: torch.nn.utils.spectral_norm
.. py:module:: torch.nn.utils.weight_norm

View File

@@ -62,3 +62,24 @@ also be interested in reading our `development wiki <https://github.com/pytorch/
onnx_dynamo
onnx_dynamo_onnxruntime_backend
onnx_torchscript
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.onnx.errors
.. py:module:: torch.onnx.operators
.. py:module:: torch.onnx.symbolic_caffe2
.. py:module:: torch.onnx.symbolic_helper
.. py:module:: torch.onnx.symbolic_opset10
.. py:module:: torch.onnx.symbolic_opset11
.. py:module:: torch.onnx.symbolic_opset13
.. py:module:: torch.onnx.symbolic_opset14
.. py:module:: torch.onnx.symbolic_opset15
.. py:module:: torch.onnx.symbolic_opset16
.. py:module:: torch.onnx.symbolic_opset17
.. py:module:: torch.onnx.symbolic_opset18
.. py:module:: torch.onnx.symbolic_opset7
.. py:module:: torch.onnx.symbolic_opset8
.. py:module:: torch.onnx.symbolic_opset9
.. py:module:: torch.onnx.utils
.. py:module:: torch.onnx.verification
.. py:module:: torch.onnx.symbolic_opset12

View File

@@ -409,3 +409,23 @@ We train the model for a total of 300 epochs and start to collect EMA averages i
>>> torch.optim.swa_utils.update_bn(loader, ema_model)
>>> # Use ema_model to make predictions on test data
>>> preds = ema_model(test_input)
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.optim.adadelta
.. py:module:: torch.optim.adagrad
.. py:module:: torch.optim.adam
.. py:module:: torch.optim.adamax
.. py:module:: torch.optim.adamw
.. py:module:: torch.optim.asgd
.. py:module:: torch.optim.lbfgs
.. py:module:: torch.optim.lr_scheduler
.. py:module:: torch.optim.nadam
.. py:module:: torch.optim.optimizer
.. py:module:: torch.optim.radam
.. py:module:: torch.optim.rmsprop
.. py:module:: torch.optim.rprop
.. py:module:: torch.optim.sgd
.. py:module:: torch.optim.sparse_adam
.. py:module:: torch.optim.swa_utils

View File

@@ -515,7 +515,7 @@ The ``torch.package`` format makes no guarantees about the contents of ``.data/``
Currently, the ``.data/`` directory contains the following items:
* ``version``: a version number for the serialized format, so that the ``torch.package`` import infrastructure knows how to load this package.
-* ``extern_modules``: a list of modules that are considered ``extern:class:`PackageImporter`. ``extern`` modules will be imported using the loading environments system importer.
+* ``extern_modules``: a list of modules that are considered ``extern``. ``extern`` modules will be imported using the loading environments system importer.
* ``*.storage``: serialized tensor data.
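A hedged sketch of how a module ends up in ``extern_modules`` (the file name ``example.pt`` is illustrative):

```python
import numpy as np
from torch.package import PackageExporter

with PackageExporter("example.pt") as exporter:
    # Modules matching "numpy.**" are recorded in .data/extern_modules and are
    # resolved by the loading environment's system importer at import time.
    exporter.extern("numpy.**")
    exporter.save_pickle("data", "array.pkl", np.ones(3))
```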
@@ -817,3 +817,16 @@ API Reference
.. autoclass:: torch.package.Directory
:members:
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.package.analyze.find_first_use_of_broken_modules
.. py:module:: torch.package.analyze.is_from_package
.. py:module:: torch.package.analyze.trace_dependencies
.. py:module:: torch.package.file_structure_representation
.. py:module:: torch.package.find_file_dependencies
.. py:module:: torch.package.glob_group
.. py:module:: torch.package.importer
.. py:module:: torch.package.package_exporter
.. py:module:: torch.package.package_importer

View File

@@ -37,3 +37,9 @@ Intel Instrumentation and Tracing Technology APIs
.. autofunction:: torch.profiler.itt.range_push
.. autofunction:: torch.profiler.itt.range_pop
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.profiler.itt
.. py:module:: torch.profiler.profiler
.. py:module:: torch.profiler.python_tracer

View File

@@ -31,7 +31,7 @@ Preparing model for quantization
:nosignatures:
:template: classtemplate.rst
-   fuse_modules
+   fuse_modules.fuse_modules
QuantStub
DeQuantStub
QuantWrapper

View File

@@ -427,7 +427,7 @@ to do the following in addition:
determine output quantization parameters.
3. Fuse modules: combine operations/modules into a single module to obtain
higher accuracy and performance. This is done using the
-:func:`~torch.ao.quantization.fuse_modules` API, which takes in lists of modules
+:func:`~torch.ao.quantization.fuse_modules.fuse_modules` API, which takes in lists of modules
to be fused. We currently support the following fusions:
[Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu]
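A hedged sketch of one of the supported fusions (the toy ``Sequential`` below is illustrative):

```python
import torch
from torch.ao.quantization import fuse_modules

model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3),
    torch.nn.BatchNorm2d(8),
    torch.nn.ReLU(),
)
# Post-training fusion expects eval mode; ["0", "1", "2"] names the
# Conv/BatchNorm/ReLU children, matching the [Conv, BatchNorm, Relu] pattern.
fused = fuse_modules(model.eval(), [["0", "1", "2"]])
```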
@@ -1246,3 +1246,159 @@ Please take a look at `Limitations of Symbolic Tracing <https://pytorch.org/docs
.. py:module:: torch.ao.pruning
.. py:module:: torch.ao.pruning.scheduler
.. py:module:: torch.ao.pruning.sparsifier
.. py:module:: torch.ao.nn.intrinsic.modules.fused
.. py:module:: torch.ao.nn.intrinsic.qat.modules.conv_fused
.. py:module:: torch.ao.nn.intrinsic.qat.modules.linear_fused
.. py:module:: torch.ao.nn.intrinsic.qat.modules.linear_relu
.. py:module:: torch.ao.nn.intrinsic.quantized.dynamic.modules.linear_relu
.. py:module:: torch.ao.nn.intrinsic.quantized.modules.bn_relu
.. py:module:: torch.ao.nn.intrinsic.quantized.modules.conv_add
.. py:module:: torch.ao.nn.intrinsic.quantized.modules.conv_relu
.. py:module:: torch.ao.nn.intrinsic.quantized.modules.linear_relu
.. py:module:: torch.ao.nn.qat.dynamic.modules.linear
.. py:module:: torch.ao.nn.qat.modules.conv
.. py:module:: torch.ao.nn.qat.modules.embedding_ops
.. py:module:: torch.ao.nn.qat.modules.linear
.. py:module:: torch.ao.nn.quantizable.modules.activation
.. py:module:: torch.ao.nn.quantizable.modules.rnn
.. py:module:: torch.ao.nn.quantized.dynamic.modules.conv
.. py:module:: torch.ao.nn.quantized.dynamic.modules.linear
.. py:module:: torch.ao.nn.quantized.dynamic.modules.rnn
.. py:module:: torch.ao.nn.quantized.modules.activation
.. py:module:: torch.ao.nn.quantized.modules.batchnorm
.. py:module:: torch.ao.nn.quantized.modules.conv
.. py:module:: torch.ao.nn.quantized.modules.dropout
.. py:module:: torch.ao.nn.quantized.modules.embedding_ops
.. py:module:: torch.ao.nn.quantized.modules.functional_modules
.. py:module:: torch.ao.nn.quantized.modules.linear
.. py:module:: torch.ao.nn.quantized.modules.normalization
.. py:module:: torch.ao.nn.quantized.modules.rnn
.. py:module:: torch.ao.nn.quantized.modules.utils
.. py:module:: torch.ao.nn.quantized.reference.modules.conv
.. py:module:: torch.ao.nn.quantized.reference.modules.linear
.. py:module:: torch.ao.nn.quantized.reference.modules.rnn
.. py:module:: torch.ao.nn.quantized.reference.modules.sparse
.. py:module:: torch.ao.nn.quantized.reference.modules.utils
.. py:module:: torch.ao.nn.sparse.quantized.dynamic.linear
.. py:module:: torch.ao.nn.sparse.quantized.linear
.. py:module:: torch.ao.nn.sparse.quantized.utils
.. py:module:: torch.ao.ns.fx.graph_matcher
.. py:module:: torch.ao.ns.fx.graph_passes
.. py:module:: torch.ao.ns.fx.mappings
.. py:module:: torch.ao.ns.fx.n_shadows_utils
.. py:module:: torch.ao.ns.fx.ns_types
.. py:module:: torch.ao.ns.fx.pattern_utils
.. py:module:: torch.ao.ns.fx.qconfig_multi_mapping
.. py:module:: torch.ao.ns.fx.utils
.. py:module:: torch.ao.ns.fx.weight_utils
.. py:module:: torch.ao.pruning.scheduler.base_scheduler
.. py:module:: torch.ao.pruning.scheduler.cubic_scheduler
.. py:module:: torch.ao.pruning.scheduler.lambda_scheduler
.. py:module:: torch.ao.pruning.sparsifier.base_sparsifier
.. py:module:: torch.ao.pruning.sparsifier.nearly_diagonal_sparsifier
.. py:module:: torch.ao.pruning.sparsifier.utils
.. py:module:: torch.ao.pruning.sparsifier.weight_norm_sparsifier
.. py:module:: torch.ao.quantization.backend_config.backend_config
.. py:module:: torch.ao.quantization.backend_config.executorch
.. py:module:: torch.ao.quantization.backend_config.fbgemm
.. py:module:: torch.ao.quantization.backend_config.native
.. py:module:: torch.ao.quantization.backend_config.observation_type
.. py:module:: torch.ao.quantization.backend_config.onednn
.. py:module:: torch.ao.quantization.backend_config.qnnpack
.. py:module:: torch.ao.quantization.backend_config.tensorrt
.. py:module:: torch.ao.quantization.backend_config.utils
.. py:module:: torch.ao.quantization.backend_config.x86
.. py:module:: torch.ao.quantization.fake_quantize
.. py:module:: torch.ao.quantization.fuser_method_mappings
.. py:module:: torch.ao.quantization.fuse_modules
.. py:module:: torch.ao.quantization.fx.convert
.. py:module:: torch.ao.quantization.fx.custom_config
.. py:module:: torch.ao.quantization.fx.fuse
.. py:module:: torch.ao.quantization.fx.fuse_handler
.. py:module:: torch.ao.quantization.fx.graph_module
.. py:module:: torch.ao.quantization.fx.lower_to_fbgemm
.. py:module:: torch.ao.quantization.fx.lower_to_qnnpack
.. py:module:: torch.ao.quantization.fx.lstm_utils
.. py:module:: torch.ao.quantization.fx.match_utils
.. py:module:: torch.ao.quantization.fx.pattern_utils
.. py:module:: torch.ao.quantization.fx.prepare
.. py:module:: torch.ao.quantization.fx.qconfig_mapping_utils
.. py:module:: torch.ao.quantization.fx.quantize_handler
.. py:module:: torch.ao.quantization.fx.tracer
.. py:module:: torch.ao.quantization.fx.utils
.. py:module:: torch.ao.quantization.observer
.. py:module:: torch.ao.quantization.pt2e.duplicate_dq_pass
.. py:module:: torch.ao.quantization.pt2e.eval_utils
.. py:module:: torch.ao.quantization.pt2e.graph_utils
.. py:module:: torch.ao.quantization.pt2e.port_metadata_pass
.. py:module:: torch.ao.quantization.pt2e.prepare
.. py:module:: torch.ao.quantization.pt2e.qat_utils
.. py:module:: torch.ao.quantization.pt2e.representation.rewrite
.. py:module:: torch.ao.quantization.pt2e.utils
.. py:module:: torch.ao.quantization.qconfig
.. py:module:: torch.ao.quantization.qconfig_mapping
.. py:module:: torch.ao.quantization.quant_type
.. py:module:: torch.ao.quantization.quantization_mappings
.. py:module:: torch.ao.quantization.quantize
.. py:module:: torch.ao.quantization.quantize_fx
.. py:module:: torch.ao.quantization.quantize_jit
.. py:module:: torch.ao.quantization.quantize_pt2e
.. py:module:: torch.ao.quantization.quantizer.composable_quantizer
.. py:module:: torch.ao.quantization.quantizer.embedding_quantizer
.. py:module:: torch.ao.quantization.quantizer.quantizer
.. py:module:: torch.ao.quantization.quantizer.utils
.. py:module:: torch.ao.quantization.quantizer.x86_inductor_quantizer
.. py:module:: torch.ao.quantization.quantizer.xnnpack_quantizer
.. py:module:: torch.ao.quantization.quantizer.xnnpack_quantizer_utils
.. py:module:: torch.ao.quantization.stubs
.. py:module:: torch.ao.quantization.utils
.. py:module:: torch.nn.intrinsic.modules.fused
.. py:module:: torch.nn.intrinsic.qat.modules.conv_fused
.. py:module:: torch.nn.intrinsic.qat.modules.linear_fused
.. py:module:: torch.nn.intrinsic.qat.modules.linear_relu
.. py:module:: torch.nn.intrinsic.quantized.dynamic.modules.linear_relu
.. py:module:: torch.nn.intrinsic.quantized.modules.bn_relu
.. py:module:: torch.nn.intrinsic.quantized.modules.conv_relu
.. py:module:: torch.nn.intrinsic.quantized.modules.linear_relu
.. py:module:: torch.nn.qat.dynamic.modules.linear
.. py:module:: torch.nn.qat.modules.conv
.. py:module:: torch.nn.qat.modules.embedding_ops
.. py:module:: torch.nn.qat.modules.linear
.. py:module:: torch.nn.quantizable.modules.activation
.. py:module:: torch.nn.quantizable.modules.rnn
.. py:module:: torch.nn.quantized.dynamic.modules.conv
.. py:module:: torch.nn.quantized.dynamic.modules.linear
.. py:module:: torch.nn.quantized.dynamic.modules.rnn
.. py:module:: torch.nn.quantized.functional
.. py:module:: torch.nn.quantized.modules.activation
.. py:module:: torch.nn.quantized.modules.batchnorm
.. py:module:: torch.nn.quantized.modules.conv
.. py:module:: torch.nn.quantized.modules.dropout
.. py:module:: torch.nn.quantized.modules.embedding_ops
.. py:module:: torch.nn.quantized.modules.functional_modules
.. py:module:: torch.nn.quantized.modules.linear
.. py:module:: torch.nn.quantized.modules.normalization
.. py:module:: torch.nn.quantized.modules.rnn
.. py:module:: torch.nn.quantized.modules.utils
.. py:module:: torch.quantization.fake_quantize
.. py:module:: torch.quantization.fuse_modules
.. py:module:: torch.quantization.fuser_method_mappings
.. py:module:: torch.quantization.fx.convert
.. py:module:: torch.quantization.fx.fuse
.. py:module:: torch.quantization.fx.fusion_patterns
.. py:module:: torch.quantization.fx.graph_module
.. py:module:: torch.quantization.fx.match_utils
.. py:module:: torch.quantization.fx.pattern_utils
.. py:module:: torch.quantization.fx.prepare
.. py:module:: torch.quantization.fx.quantization_patterns
.. py:module:: torch.quantization.fx.quantization_types
.. py:module:: torch.quantization.fx.utils
.. py:module:: torch.quantization.observer
.. py:module:: torch.quantization.qconfig
.. py:module:: torch.quantization.quant_type
.. py:module:: torch.quantization.quantization_mappings
.. py:module:: torch.quantization.quantize
.. py:module:: torch.quantization.quantize_fx
.. py:module:: torch.quantization.quantize_jit
.. py:module:: torch.quantization.stubs
.. py:module:: torch.quantization.utils

View File

@@ -2,6 +2,7 @@
torch.overrides
---------------
.. py:module:: torch.overrides
This module exposes various helper functions for the ``__torch_function__``
protocol. See :ref:`extending-torch-python` for more details on the
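A minimal sketch of the protocol these helpers serve (the ``LoggingTensor`` class is illustrative):

```python
import torch

class LoggingTensor(torch.Tensor):
    # Log each intercepted op, then defer to the default Tensor behavior.
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        print("intercepted:", torch.overrides.resolve_name(func))
        return super().__torch_function__(func, types, args, kwargs or {})

t = LoggingTensor([1.0, 2.0])
t + t  # prints something like "intercepted: torch.Tensor.add"
```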

View File

@@ -744,3 +744,13 @@ Operator Tags
.. for tracking purposes
.. py:module:: torch.utils.model_dump
.. py:module:: torch.utils.viz
.. py:module:: torch.functional
.. py:module:: torch.quasirandom
.. py:module:: torch.return_types
.. py:module:: torch.serialization
.. py:module:: torch.signal.windows.windows
.. py:module:: torch.sparse.semi_structured
.. py:module:: torch.storage
.. py:module:: torch.torch_version
.. py:module:: torch.types
.. py:module:: torch.version

View File

@@ -11,3 +11,79 @@ torch.utils
generate_methods_for_privateuse1_backend
get_cpp_backtrace
set_module
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.utils.backend_registration
.. py:module:: torch.utils.benchmark.examples.blas_compare_setup
.. py:module:: torch.utils.benchmark.examples.compare
.. py:module:: torch.utils.benchmark.examples.fuzzer
.. py:module:: torch.utils.benchmark.examples.op_benchmark
.. py:module:: torch.utils.benchmark.examples.simple_timeit
.. py:module:: torch.utils.benchmark.examples.spectral_ops_fuzz_test
.. py:module:: torch.utils.benchmark.op_fuzzers.binary
.. py:module:: torch.utils.benchmark.op_fuzzers.sparse_binary
.. py:module:: torch.utils.benchmark.op_fuzzers.sparse_unary
.. py:module:: torch.utils.benchmark.op_fuzzers.spectral
.. py:module:: torch.utils.benchmark.op_fuzzers.unary
.. py:module:: torch.utils.benchmark.utils.common
.. py:module:: torch.utils.benchmark.utils.compare
.. py:module:: torch.utils.benchmark.utils.compile
.. py:module:: torch.utils.benchmark.utils.cpp_jit
.. py:module:: torch.utils.benchmark.utils.fuzzer
.. py:module:: torch.utils.benchmark.utils.sparse_fuzzer
.. py:module:: torch.utils.benchmark.utils.timer
.. py:module:: torch.utils.benchmark.utils.valgrind_wrapper.timer_interface
.. py:module:: torch.utils.bundled_inputs
.. py:module:: torch.utils.checkpoint
.. py:module:: torch.utils.collect_env
.. py:module:: torch.utils.cpp_backtrace
.. py:module:: torch.utils.cpp_extension
.. py:module:: torch.utils.data.backward_compatibility
.. py:module:: torch.utils.data.dataloader
.. py:module:: torch.utils.data.datapipes.dataframe.dataframe_wrapper
.. py:module:: torch.utils.data.datapipes.dataframe.dataframes
.. py:module:: torch.utils.data.datapipes.dataframe.datapipes
.. py:module:: torch.utils.data.datapipes.dataframe.structures
.. py:module:: torch.utils.data.datapipes.datapipe
.. py:module:: torch.utils.data.datapipes.gen_pyi
.. py:module:: torch.utils.data.datapipes.iter.callable
.. py:module:: torch.utils.data.datapipes.iter.combinatorics
.. py:module:: torch.utils.data.datapipes.iter.combining
.. py:module:: torch.utils.data.datapipes.iter.filelister
.. py:module:: torch.utils.data.datapipes.iter.fileopener
.. py:module:: torch.utils.data.datapipes.iter.grouping
.. py:module:: torch.utils.data.datapipes.iter.routeddecoder
.. py:module:: torch.utils.data.datapipes.iter.selecting
.. py:module:: torch.utils.data.datapipes.iter.sharding
.. py:module:: torch.utils.data.datapipes.iter.streamreader
.. py:module:: torch.utils.data.datapipes.iter.utils
.. py:module:: torch.utils.data.datapipes.map.callable
.. py:module:: torch.utils.data.datapipes.map.combinatorics
.. py:module:: torch.utils.data.datapipes.map.combining
.. py:module:: torch.utils.data.datapipes.map.grouping
.. py:module:: torch.utils.data.datapipes.map.utils
.. py:module:: torch.utils.data.datapipes.utils.common
.. py:module:: torch.utils.data.datapipes.utils.decoder
.. py:module:: torch.utils.data.datapipes.utils.snapshot
.. py:module:: torch.utils.data.dataset
.. py:module:: torch.utils.data.distributed
.. py:module:: torch.utils.data.graph
.. py:module:: torch.utils.data.graph_settings
.. py:module:: torch.utils.data.sampler
.. py:module:: torch.utils.dlpack
.. py:module:: torch.utils.file_baton
.. py:module:: torch.utils.flop_counter
.. py:module:: torch.utils.hipify.constants
.. py:module:: torch.utils.hipify.cuda_to_hip_mappings
.. py:module:: torch.utils.hipify.hipify_python
.. py:module:: torch.utils.hipify.version
.. py:module:: torch.utils.hooks
.. py:module:: torch.utils.jit.log_extract
.. py:module:: torch.utils.mkldnn
.. py:module:: torch.utils.mobile_optimizer
.. py:module:: torch.utils.show_pickle
.. py:module:: torch.utils.tensorboard.summary
.. py:module:: torch.utils.tensorboard.writer
.. py:module:: torch.utils.throughput_benchmark
.. py:module:: torch.utils.weak

View File

@@ -1,231 +0,0 @@
import argparse
import datetime
import itertools as it
import multiprocessing
import multiprocessing.dummy
import os
import queue
import pickle
import shutil
import subprocess
import sys
import tempfile
import threading
import time
from typing import Tuple, Dict
from . import blas_compare_setup
MIN_RUN_TIME = 1
NUM_REPLICATES = 20
NUM_THREAD_SETTINGS = (1, 2, 4)
RESULT_FILE = os.path.join(blas_compare_setup.WORKING_ROOT, "blas_results.pkl")
SCRATCH_DIR = os.path.join(blas_compare_setup.WORKING_ROOT, "scratch")
BLAS_CONFIGS = (
("MKL (2020.3)", blas_compare_setup.MKL_2020_3, None),
("MKL (2020.0)", blas_compare_setup.MKL_2020_0, None),
("OpenBLAS", blas_compare_setup.OPEN_BLAS, None)
)
_RESULT_FILE_LOCK = threading.Lock()
_WORKER_POOL: queue.Queue[Tuple[str, str, int]] = queue.Queue()
def clear_worker_pool():
while not _WORKER_POOL.empty():
_, result_file, _ = _WORKER_POOL.get_nowait()
os.remove(result_file)
if os.path.exists(SCRATCH_DIR):
shutil.rmtree(SCRATCH_DIR)
def fill_core_pool(n: int):
clear_worker_pool()
os.makedirs(SCRATCH_DIR)
# Reserve two cores so that bookkeeping does not interfere with runs.
cpu_count = multiprocessing.cpu_count() - 2
# Adjacent cores sometimes share cache, so we space out single core runs.
step = max(n, 2)
for i in range(0, cpu_count, step):
core_str = f"{i}" if n == 1 else f"{i},{i + n - 1}"
_, result_file = tempfile.mkstemp(suffix=".pkl", prefix=SCRATCH_DIR)
_WORKER_POOL.put((core_str, result_file, n))
def _subprocess_main(seed=0, num_threads=1, sub_label="N/A", result_file=None, env=None):
import torch
from torch.utils.benchmark import Timer
conda_prefix = os.getenv("CONDA_PREFIX")
assert conda_prefix
if not torch.__file__.startswith(conda_prefix):
raise ValueError(
f"PyTorch mismatch: `import torch` resolved to `{torch.__file__}`, "
f"which is not in the correct conda env: {conda_prefix}"
)
torch.manual_seed(seed)
results = []
for n in [4, 8, 16, 32, 64, 128, 256, 512, 1024, 7, 96, 150, 225]:
dtypes = (("Single", torch.float32), ("Double", torch.float64))
shapes = (
# Square MatMul
((n, n), (n, n), "(n x n) x (n x n)", "Matrix-Matrix Product"),
# Matrix-Vector product
((n, n), (n, 1), "(n x n) x (n x 1)", "Matrix-Vector Product"),
)
for (dtype_name, dtype), (x_shape, y_shape, shape_str, blas_type) in it.product(dtypes, shapes):
t = Timer(
stmt="torch.mm(x, y)",
label=f"torch.mm {shape_str} {blas_type} ({dtype_name})",
sub_label=sub_label,
description=f"n = {n}",
env=os.path.split(env or "")[1] or None,
globals={
"x": torch.rand(x_shape, dtype=dtype),
"y": torch.rand(y_shape, dtype=dtype),
},
num_threads=num_threads,
).blocked_autorange(min_run_time=MIN_RUN_TIME)
results.append(t)
if result_file is not None:
with open(result_file, "wb") as f:
pickle.dump(results, f)
def run_subprocess(args):
seed, env, sub_label, extra_env_vars = args
core_str = None
try:
core_str, result_file, num_threads = _WORKER_POOL.get()
with open(result_file, "wb"):
pass
env_vars: Dict[str, str] = {
"PATH": os.getenv("PATH") or "",
"PYTHONPATH": os.getenv("PYTHONPATH") or "",
# NumPy
"OMP_NUM_THREADS": str(num_threads),
"MKL_NUM_THREADS": str(num_threads),
"NUMEXPR_NUM_THREADS": str(num_threads),
}
env_vars.update(extra_env_vars or {})
subprocess.run(
f"source activate {env} && "
f"taskset --cpu-list {core_str} "
f"python {os.path.abspath(__file__)} "
"--DETAIL-in-subprocess "
f"--DETAIL-seed {seed} "
f"--DETAIL-num-threads {num_threads} "
f"--DETAIL-sub-label '{sub_label}' "
f"--DETAIL-result-file {result_file} "
f"--DETAIL-env {env}",
env=env_vars,
stdout=subprocess.PIPE,
shell=True
)
with open(result_file, "rb") as f:
result_bytes = f.read()
with _RESULT_FILE_LOCK, \
open(RESULT_FILE, "ab") as f:
f.write(result_bytes)
except KeyboardInterrupt:
pass # Handle ctrl-c gracefully.
finally:
if core_str is not None:
_WORKER_POOL.put((core_str, result_file, num_threads))
def _compare_main():
results = []
with open(RESULT_FILE, "rb") as f:
while True:
try:
results.extend(pickle.load(f))
except EOFError:
break
from torch.utils.benchmark import Compare
comparison = Compare(results)
comparison.trim_significant_figures()
comparison.colorize()
comparison.print()
def main():
with open(RESULT_FILE, "wb"):
pass
for num_threads in NUM_THREAD_SETTINGS:
fill_core_pool(num_threads)
workers = _WORKER_POOL.qsize()
trials = []
for seed in range(NUM_REPLICATES):
for sub_label, env, extra_env_vars in BLAS_CONFIGS:
env_path = os.path.join(blas_compare_setup.WORKING_ROOT, env)
trials.append((seed, env_path, sub_label, extra_env_vars))
n = len(trials)
with multiprocessing.dummy.Pool(workers) as pool:
start_time = time.time()
for i, r in enumerate(pool.imap(run_subprocess, trials)):
n_trials_done = i + 1
time_per_result = (time.time() - start_time) / n_trials_done
eta = int((n - n_trials_done) * time_per_result)
print(f"\r{i + 1} / {n} ETA:{datetime.timedelta(seconds=eta)}".ljust(80), end="")
sys.stdout.flush()
print(f"\r{n} / {n} Total time: {datetime.timedelta(seconds=int(time.time() - start_time))}")
print()
# Any env will do, it just needs to have torch for benchmark utils.
env_path = os.path.join(blas_compare_setup.WORKING_ROOT, BLAS_CONFIGS[0][1])
subprocess.run(
f"source activate {env_path} && "
f"python {os.path.abspath(__file__)} "
"--DETAIL-in-compare",
shell=True
)
if __name__ == "__main__":
# These flags are for subprocess control, not controlling the main loop.
parser = argparse.ArgumentParser()
parser.add_argument("--DETAIL-in-subprocess", "--DETAIL_in_subprocess", action="store_true")
parser.add_argument("--DETAIL-in-compare", "--DETAIL_in_compare", action="store_true")
parser.add_argument("--DETAIL-seed", "--DETAIL_seed", type=int, default=None)
parser.add_argument("--DETAIL-num-threads", "--DETAIL_num_threads", type=int, default=None)
parser.add_argument("--DETAIL-sub-label", "--DETAIL_sub_label", type=str, default="N/A")
parser.add_argument("--DETAIL-result-file", "--DETAIL_result_file", type=str, default=None)
parser.add_argument("--DETAIL-env", "--DETAIL_env", type=str, default=None)
args = parser.parse_args()
if args.DETAIL_in_subprocess:
try:
_subprocess_main(
args.DETAIL_seed,
args.DETAIL_num_threads,
args.DETAIL_sub_label,
args.DETAIL_result_file,
args.DETAIL_env,
)
except KeyboardInterrupt:
pass # Handle ctrl-c gracefully.
elif args.DETAIL_in_compare:
_compare_main()
else:
main()

View File

@@ -1,426 +0,0 @@
"""End-to-end example to test a PR for regressions:
$ python -m examples.end_to_end --pr 39850
$ python -m examples.end_to_end --pr 39967
$ python -m examples.end_to_end --pr 39744
NOTE:
This example assumes that you have an environment prefixed with
`ref_`, and another prefixed with `pr_` for the PR
in question. (e.g. `ref_39850` and `pr_39850`).
A helper script (examples/prepare_e2e.sh) is provided to build
the required environments with the correct configuration.
"""
import argparse
import itertools as it
import multiprocessing
import multiprocessing.dummy
import os
import pickle
import queue
import subprocess
import tempfile
import textwrap
import numpy as np
import torch
from torch.utils.benchmark.op_fuzzers import unary
from torch.utils.benchmark import Timer, Measurement
from typing import Dict, Tuple, List
_MAIN, _SUBPROCESS = "main", "subprocess"
_PR_ENV_TEMPLATE = "pr_{pr}"
_REF_ENV_TEMPLATE = "ref_{pr}"
_PR_LIST = (
# Optimize topk performance for tensor with a large dimension size
"39850",
# Migrate `var` & `std` to ATen
"39967",
# Introducing (Const)StridedRandomAccessor + CompositeRandomAccessor + migrate `sort` to ATen (CPU)
"39744",
)
_CPU, _GPU = "cpu", "gpu"
_MIN_RUN_SEC = 1
_REPLICATES = {
_CPU: 5, # CPU has a higher variance.
_GPU: 1,
}
_RUNS_PER_LOOP = 3
_NUM_LOOPS = {
_CPU: 32,
_GPU: 64,
}
_DEVICES_TO_TEST = {
"39850": {_CPU: False, _GPU: True},
"39967": {_CPU: True, _GPU: True},
"39744": {_CPU: True, _GPU: True},
}
_AVAILABLE_GPUS = queue.Queue[int]()
_DTYPES_TO_TEST = {
"39850": ("int8", "float32", "float64"),
"39967": ("float32", "float64"),
"39744": ("int8", "float32", "float64"),
}
_DTYPE_STR_TO_DTYPE = {
"float64": torch.float64,
"float32": torch.float32,
"int8": torch.int8,
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--pr", type=str, default=_PR_LIST[0], choices=_PR_LIST)
parser.add_argument("--num-gpus", "--num_gpus", type=int, default=None)
parser.add_argument("--test-variance", "--test_variance", action="store_true")
# (Implementation details)
parser.add_argument("--DETAIL-context", "--DETAIL_context", type=str, choices=(_MAIN, _SUBPROCESS), default=_MAIN)
parser.add_argument("--DETAIL-device", "--DETAIL_device", type=str, choices=(_CPU, _GPU), default=None)
parser.add_argument("--DETAIL-env", "--DETAIL_env", type=str, default=None)
parser.add_argument("--DETAIL-result-file", "--DETAIL_result_file", type=str, default=None)
parser.add_argument("--DETAIL-seed", "--DETAIL_seed", type=int, default=None)
args = parser.parse_args()
if args.num_gpus is None:
args.num_gpus = torch.cuda.device_count()
return args
_SUBPROCESS_CMD_TEMPLATE = (
"source activate {source_env} && python -m examples.end_to_end "
"--pr {pr} "
"--DETAIL-context subprocess "
"--DETAIL-device {device} "
"--DETAIL-env {env} "
"--DETAIL-result-file {result_file} "
"--DETAIL-seed {seed}"
)
def construct_stmt_and_label(pr, params):
if pr == "39850":
k0, k1, k2, dim = (params[i] for i in ["k0", "k1", "k2", "dim"])
state = np.random.RandomState(params["random_value"])
topk_dim = state.randint(low=0, high=dim)
dim_size = [k0, k1, k2][topk_dim]
k = max(int(np.floor(2 ** state.uniform(low=0, high=np.log2(dim_size)))), 1)
return f"torch.topk(x, dim={topk_dim}, k={k})", "topk"
if pr == "39967":
return "torch.std(x)", "std"
if pr == "39744":
state = np.random.RandomState(params["random_value"])
sort_dim = state.randint(low=0, high=params["dim"])
return f"torch.sort(x, dim={sort_dim})", "sort"
raise ValueError("Unknown PR")
def subprocess_main(args):
seed = args.DETAIL_seed
cuda = (args.DETAIL_device == _GPU)
with open(args.DETAIL_result_file, "ab") as f:
for dtype_str in _DTYPES_TO_TEST[args.pr]:
dtype = _DTYPE_STR_TO_DTYPE[dtype_str]
iterator = unary.UnaryOpFuzzer(
seed=seed, dtype=dtype, cuda=cuda).take(_RUNS_PER_LOOP)
for i, (tensors, tensor_parameters, params) in enumerate(iterator):
params["dtype_str"] = dtype_str
stmt, label = construct_stmt_and_label(args.pr, params)
timer = Timer(
stmt=stmt,
globals=tensors,
label=label,
description=f"[{i}, seed={seed}] ({dtype_str}), stmt = {stmt}",
env=args.DETAIL_env,
)
measurement = timer.blocked_autorange(min_run_time=_MIN_RUN_SEC)
measurement.metadata = {
"tensor_parameters": tensor_parameters,
"params": params,
}
print(measurement)
pickle.dump(measurement, f)
def _main(args):
pools, map_iters, finished_counts = {}, {}, {}
pr = args.pr
envs = (_REF_ENV_TEMPLATE.format(pr=pr), _PR_ENV_TEMPLATE.format(pr=pr))
# We initialize both pools at the start so that they run simultaneously
# if applicable
if _DEVICES_TO_TEST[args.pr][_GPU]:
finished_counts[_GPU] = 0
for i in range(args.num_gpus):
_AVAILABLE_GPUS.put(i)
pools[_GPU] = multiprocessing.dummy.Pool(args.num_gpus)
trials = [
(seed, envs, pr, True, finished_counts, args.test_variance)
for seed in range(_NUM_LOOPS[_GPU])] * _REPLICATES[_GPU]
map_iters[_GPU] = pools[_GPU].imap(map_fn, trials)
if _DEVICES_TO_TEST[args.pr][_CPU]:
finished_counts[_CPU] = 0
cpu_workers = int(multiprocessing.cpu_count() / 3)
pools[_CPU] = multiprocessing.dummy.Pool(cpu_workers)
trials = [
(seed, envs, pr, False, finished_counts, args.test_variance)
for seed in range(_NUM_LOOPS[_CPU])] * _REPLICATES[_CPU]
map_iters[_CPU] = pools[_CPU].imap(map_fn, trials)
results = []
for map_iter in map_iters.values():
for r in map_iter:
results.append(r)
progress = [
f"{k}: {v} / {_NUM_LOOPS[k] * _REPLICATES[k]}"
for k, v in finished_counts.items()]
print(f"\r{(' ' * 10).join(progress)}", end="")
print()
for pool in pools.values():
pool.close()
process_results(results, args.test_variance)
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# == Data processing and string formatting ====================================
# /////////////////////////////////////////////////////////////////////////////
def merge(measurements):
if not measurements:
return None
states = [m.__getstate__() for m in measurements]
for k in states[0].keys():
if k in ("number_per_run", "times", "metadata"):
continue
assert all(s[k] == states[0][k] for s in states)
numbers_per_run = {m.number_per_run for m in measurements}
n = numbers_per_run.pop() if len(numbers_per_run) == 1 else 1
merged_state = states[0]
times = [[t / m.number_per_run * n for t in m.times] for m in measurements]
merged_state["times"] = list(it.chain(*times))
merged_state["number_per_run"] = n
merged_state["metadata"] = states[0]["metadata"]
return Measurement(**merged_state)
def process_results(results, test_variance):
paired_results: Dict[Tuple[str, str, int, bool, int], List] = {}
for (seed, use_gpu), result_batch in results:
for r in result_batch:
key = (r.label, r.description, r.num_threads, use_gpu, seed)
paired_results.setdefault(key, [[], []])
index = 0 if r.env.startswith("ref") else 1
paired_results[key][index].append(r)
paired_results = {
key: [merge(r_ref_list), merge(r_pr_list)]
for key, (r_ref_list, r_pr_list) in paired_results.items()
}
flagged_for_removal = set()
for key, (r_ref, r_pr) in paired_results.items():
if any(r is None or r.has_warnings for r in (r_ref, r_pr)):
flagged_for_removal.add(key)
paired_results = {
k: v for k, v in paired_results.items()
if k not in flagged_for_removal
}
print(f"{len(flagged_for_removal)} samples were culled, {len(paired_results)} remain")
gpu_results = [(k, v) for k, v in paired_results.items() if k[3]]
cpu_results = [(k, v) for k, v in paired_results.items() if not k[3]]
if cpu_results:
construct_table(cpu_results, "CPU", test_variance)
if gpu_results:
construct_table(gpu_results, "GPU", test_variance)
def construct_table(results, device_str, test_variance):
device_str = f"== {device_str} {' (Variance Test)' if test_variance else ''} ".ljust(40, "=")
print(f"{'=' * 40}\n{device_str}\n{'=' * 40}\n")
results = sorted((
(key, (r_ref, r_pr), r_pr.median / r_ref.median - 1)
for key, (r_ref, r_pr) in results
), key=lambda i: i[2])
n = len(results)
n_regressed = len([i for i in results if i[2] > 0.05])
n_improved = len([i for i in results if i[2] < -0.05])
n_unchanged = n - n_improved - n_regressed
legends = ["Improved (>5%):", "Regressed (>5%):", "Within 5%:"]
for legend, count in zip(legends, [n_improved, n_regressed, n_unchanged]):
print(f"{legend:<17} {count:>6} ({count / len(results) * 100:>3.0f}%)")
keys_to_print = (
{i[0] for i in results[20:30]} |
{i[0] for i in results[int(n // 2 - 5):int(n // 2 + 5)]} |
{i[0] for i in results[-30:-20]}
)
ellipsis_after = {results[29][0], results[int(n // 2 + 4)][0]}
column_labels = (
f"Relative Δ Absolute Δ | numel{'':>8}dtype{'':>14}"
f"shape{'':>10}steps{'':>10}layout{'':>7}task specific\n{'=' * 126}"
)
_, result_log_file = tempfile.mkstemp(suffix=".log")
with open(result_log_file, "w") as f:
f.write(f"{device_str}\n\n{column_labels}\n")
print(f"\n{column_labels}\n[First twenty omitted (these tend to be noisy) ]")
for key, (r_ref, r_pr), rel_diff in results:
row = row_str(rel_diff, r_pr.median - r_ref.median, r_ref)
f.write(f"{row}\n")
if key in keys_to_print:
print(row)
if key in ellipsis_after:
print("...")
print("[Last twenty omitted (these tend to be noisy) ]")
print(textwrap.dedent("""
steps:
Indicates that `x` is sliced from a larger Tensor. For instance, if
shape is [12, 4] and steps are [2, 1], then a larger Tensor of size
[24, 4] was created, and then x = base_tensor[::2, ::1]. Omitted if
all elements are ones.
layout:
Indicates that `x` is not contiguous due to permutation. Invoking
`x.permute(layout)` (e.g. x.permute((2, 0, 1)) if layout = [2, 0, 1])
would produce a Tensor with physical memory layout matching logical
memory layout. (Though still not contiguous if `steps` contains
non-one elements.)
"""))
print(f"\nComplete results in: {result_log_file}")
def row_str(rel_diff, diff_seconds, measurement):
params = measurement.metadata["params"]
tensor_parameters = measurement.metadata["tensor_parameters"]
dim = params["dim"]
x_numel = tensor_parameters["x"]["numel"]
steps = [params[f"x_step_{i}"] for i in range(dim)]
order = tensor_parameters['x']["order"]
order = str("" if all(i == j for i, j in zip(order, range(dim))) else order)
task_specific = ""
if measurement.stmt.startswith("torch.topk"):
dim_str, k_str = measurement.stmt[:-1].replace("torch.topk(x, ", "").split(", ")
task_specific = f"{dim_str}, {k_str:<8}"
elif measurement.stmt.startswith("torch.std"):
pass
elif measurement.stmt.startswith("torch.sort"):
task_specific = measurement.stmt[:-1].replace("torch.sort(x, ", "")
return (
f"{rel_diff * 100:>5.0f}% {abs(diff_seconds) * 1e6:>11.1f} us{'':>6}|"
f"{x_numel:>12} {params['dtype_str']:>10} "
f"{str([params[f'k{i}'] for i in range(dim)]):>17} "
f"{str(steps) if not all(i == 1 for i in steps) else '':>12} {order:>12}"
f"{'':>8}{task_specific}"
)
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# == Subprocess and environment management ====================================
# /////////////////////////////////////////////////////////////////////////////
def read_results(result_file: str):
output = []
with open(result_file, "rb") as f:
while True:
try:
output.append(pickle.load(f))
except EOFError:
break
return output
def run(cmd, cuda_visible_devices=""):
return subprocess.run(
cmd,
env={
"CUDA_VISIBLE_DEVICES": str(cuda_visible_devices),
"PATH": os.getenv("PATH", ""),
},
stdout=subprocess.PIPE,
shell=True
)
def test_source(envs):
"""Ensure that subprocess"""
for env in envs:
result = run(f"source activate {env}")
if result.returncode != 0:
raise ValueError(f"Failed to source environment `{env}`")
def map_fn(args):
seed, envs, pr, use_gpu, finished_counts, test_variance = args
gpu = _AVAILABLE_GPUS.get() if use_gpu else None
try:
_, result_file = tempfile.mkstemp(suffix=".pkl")
for env in envs:
cmd = _SUBPROCESS_CMD_TEMPLATE.format(
source_env=envs[0] if test_variance else env,
env=env, pr=pr, device=_GPU if use_gpu else _CPU,
result_file=result_file, seed=seed,
)
run(cmd=cmd, cuda_visible_devices=gpu if use_gpu else "")
finished_counts[_GPU if use_gpu else _CPU] += 1
return (seed, use_gpu), read_results(result_file)
except KeyboardInterrupt:
pass # Handle ctrl-c gracefully.
finally:
if gpu is not None:
_AVAILABLE_GPUS.put(gpu)
if os.path.exists(result_file):
os.remove(result_file)
def main(args):
test_source([
_REF_ENV_TEMPLATE.format(pr=args.pr),
_PR_ENV_TEMPLATE.format(pr=args.pr),
])
_main(args)
if __name__ == "__main__":
args = parse_args()
if args.DETAIL_context == "main":
main(args)
if args.DETAIL_context == "subprocess":
try:
subprocess_main(args)
except KeyboardInterrupt:
pass # Handle ctrl-c gracefully.