Enable Doc builds for: Minor Releases RCs. Minor and Patch Releases final RC (#167494 )

Enable Doc builds for: Minor Releases RCs. Minor and Patch Releases final RC (#167478) Enable Doc builds for 1. Minor Releases RCs 2. Minor and Patch Releases final RC This is done to prevent publishing doc for patch releases when building rcs. See: https://github.com/pytorch/docs/pull/57 Followup after: https://github.com/pytorch/pytorch/pull/153973 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167478 Approved by: https://github.com/svekars, https://github.com/seemethere (cherry picked from commit f6331192b4b105bb8a20823dc02e33b55e5c91e2) Co-authored-by: atalman <atalman@fb.com>
Add doc for Symmetric Memory (#167477 )
2025-11-14 22:25:03 +08:00 · 2025-11-10 20:02:50 -05:00 · 2025-11-10 15:18:23 -05:00 · 2025-11-07 17:04:27 -05:00 · 2025-11-07 17:03:33 -05:00 · 2025-11-07 16:58:33 -05:00
13 changed files with 441 additions and 19 deletions
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@ -5,9 +5,11 @@ on:
    - cron: 0 0 * * *
  push:
    tags:
-      # NOTE: Doc build pipelines should only get triggered on release candidate builds
-      # Release candidate tags look like: v1.11.0-rc1
-      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      # NOTE: Doc build pipelines should only get triggered on:
+      # Major or minor release candidate builds
+      - v[0-9]+.[0-9]+.0+-rc[0-9]+
+      # Final RC for major, minor and patch releases
+      - v[0-9]+.[0-9]+.[0-9]+
      - ciflow/nightly/*
  workflow_dispatch:

--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -155,6 +155,12 @@ class TORCH_API Context {
  static long versionCuDNN() {
    return detail::getCUDAHooks().versionCuDNN();
  }
+  static long versionRuntimeCuDNN() {
+    return detail::getCUDAHooks().versionRuntimeCuDNN();
+  }
+  static long versionCuDNNFrontend() {
+    return detail::getCUDAHooks().versionCuDNNFrontend();
+  }
  static bool hasCuSOLVER() {
    return detail::getCUDAHooks().hasCuSOLVER();
  }
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -21,6 +21,7 @@

 #if AT_CUDNN_ENABLED()
 #include <ATen/cudnn/cudnn-wrapper.h>
+#include <cudnn_frontend.h>
 #endif

 #if AT_MAGMA_ENABLED()
@ -325,6 +326,26 @@ long CUDAHooks::versionCuDNN() const {
 #endif
 }

+long CUDAHooks::versionRuntimeCuDNN() const {
+#if AT_CUDNN_ENABLED()
+#ifndef USE_STATIC_CUDNN
+  return cudnnGetVersion();
+#else
+  return CUDNN_VERSION;
+#endif
+#else
+  TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
+#endif
+}
+
+long CUDAHooks::versionCuDNNFrontend() const {
+#if AT_CUDNN_ENABLED()
+  return CUDNN_FRONTEND_VERSION;
+#else
+  TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
+#endif
+}
+
 long CUDAHooks::versionMIOpen() const {
 #if AT_ROCM_ENABLED()
  return MIOPEN_VERSION_MAJOR * 10000 +
--- a/aten/src/ATen/cuda/detail/CUDAHooks.h
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.h
@ -48,6 +48,8 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
+  long versionRuntimeCuDNN() const override;
+  long versionCuDNNFrontend() const override;
  long versionMIOpen() const override;
  std::string showConfig() const override;
  double batchnormMinEpsilonCuDNN() const override;
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@ -170,6 +170,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
  }

+  virtual long versionRuntimeCuDNN() const {
+    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual long versionCuDNNFrontend() const {
+    TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
+  }
+
  virtual long versionMIOpen() const {
    TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
  }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -413,7 +413,7 @@ struct ConvParams {
    if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
      return false;
    }
-    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
+    static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
    // broken on cuDNN 9.8 - 9.14
    if (cudnn_version >= 90800 && cudnn_version < 91500) {
      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
@ -457,7 +457,7 @@ struct ConvParams {
    }
    // native kernel doesn't support 64-bit non-splittable case
    if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
-      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
+      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
      // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
      if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
        if (cudnn_version < 0 || cudnn_version > 91000) {
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@ -437,7 +437,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
  const auto s_k = params.key.sym_size(2);
  const auto d_qk = params.query.sym_size(3);
  const auto d_v = params.value.sym_size(3);
-  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
+  long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
  if (cudnn_version < 8903) {
    if (debug) {
      TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
@ -668,7 +668,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
  return false;
 #endif
 #if defined(CUDNN_VERSION)
-  static auto cudnn_version = cudnnGetVersion();
+  static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
  if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
    if (debug) {
      TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");
--- a/docs/source/pytorch-api.md
+++ b/docs/source/pytorch-api.md
@ -41,6 +41,7 @@ torch.distributed.fsdp.fully_shard <distributed.fsdp.fully_shard>
 torch.distributed.tensor.parallel <distributed.tensor.parallel>
 torch.distributed.optim <distributed.optim>
 torch.distributed.pipelining <distributed.pipelining>
+torch.distributed._symmetric_memory <symmetric_memory>
 torch.distributed.checkpoint <distributed.checkpoint>
 torch.distributions <distributions>
 torch.compiler <torch.compiler>
--- a/docs/source/symmetric_memory.md
+++ b/docs/source/symmetric_memory.md
@ -0,0 +1,380 @@
+```{eval-rst}
+.. role:: hidden
+    :class: hidden-section
+```
+
+# PyTorch Symmetric Memory
+
+:::{note}
+`torch.distributed._symmetric_memory` is currently in alpha state and under
+development. The API may change.
+:::
+
+## Why Symmetric Memory?
+
+With rapidly evolving parallelization techniques, existing frameworks and
+libraries often struggle to keep up, and developers increasingly rely on custom
+implementations directly scheduling communications and computations. In recent
+years we’ve witnessed a shift from primarily relying on one-dimensional
+data-parallelism techniques to multi-dimensional parallelism ones. The latter
+have different latency requirements for different types of communications and
+thus require fine-grained overlapping of compute and communications.
+
+To minimize compute interference, they also require the use of copy engines and
+network interface cards (NICs) to drive communication. Network transport
+protocols such as remote direct memory access (RDMA) enhance the performance by
+enabling direct, high-speed, and low-latency communication between processors
+and memory. This increase in variety indicates the need for finer-grained
+communication primitives than are offered today by high-level collective APIs,
+ones that would enable developers to implement specific algorithms tailored for
+their use cases, such as low-latency collectives, fine-grained
+compute-communications overlap, or custom fusions.
+
+Furthermore, today’s advanced AI systems connect GPUs with high-bandwidth links
+(such as NVLinks, InfiniBand or RoCE), making GPU global memory directly
+accessible to peers. Such connections present a great opportunity for
+programmers to program the system as a single, gigantic GPU with vast accessible
+memory, instead of programming singular “GPU islands.”
+
+In this document, we will show how you can use PyTorch Symmetric Memory to
+program modern GPU systems as a “single GPU” and achieve fine-grained remote
+access.
+
+## What does PyTorch Symmetric Memory unlock?
+
+PyTorch Symmetric Memory unlocks three new capabilities:
+
+- **Customized communication patterns**: Increased flexibility in kernel writing
+allows developers to write custom kernels that implement their custom
+computations and communications, directly tailored to the needs of the
+application. It will also be straightforward to add support for new data types
+along with the special compute that those data types might require, even if it’s
+not present yet in the standard libraries.
+
+- **In-kernel compute-comm fusion**: Device-initiated communication capability
+allows developers to write kernels with both computation and communication
+instructions, allowing for the fusion of computation and data movement in the
+smallest possible granularity.
+
+- **Low-latency remote access**: Network transport protocols like RDMA enhance the
+performance of symmetric memory in networked environments by enabling direct,
+high-speed, and low-latency communication between processors and memory. RDMA
+eliminates the overhead associated with the traditional network stack and CPU
+involvement. It also offloads data transfer from the compute to the NICs,
+freeing up compute resources for computational tasks.
+
+Next, we will show you how PyTorch Symmetric Memory (SymmMem) enables new
+applications with the above capabilities.
+
+## A “Hello World” example
+
+The PyTorch SymmMem programming model involves two key elements:
+
+- creating symmetric tensors
+- creating SymmMem kernels
+
+To create symmetric tensors, one can use the
+`torch.distributed._symmetric_memory` package:
+
+```python
+import torch.distributed._symmetric_memory as symm_mem
+
+t = symm_mem.empty(128, device=torch.device("cuda", rank))
+hdl = symm_mem.rendezvous(t, group)
+```
+
+The `symm_mem.empty` function creates a tensor that is backed by a symmetric
+memory allocation. The `rendezvous` function establishes a rendezvous with peers
+in the group, and returns a handle to the symmetric memory allocation. The
+handle provides methods to access information related to the symmetric memory
+allocation, such as pointers to symmetric buffers on peer ranks, multicast
+pointer (if supported), and signal pads.
+
+The `empty` and `rendezvous` functions must be called in the same order on all
+ranks in the group.
+
+Then, collectives can be called on these tensors. For example, to perform a
+one-shot all-reduce:
+
+```python
+# Most SymmMem ops are under the torch.ops.symm_mem namespace
+torch.ops.symm_mem.one_shot_all_reduce(t, "sum", group)
+```
+
+Please note that `torch.ops.symm_mem` is an "op namespace" instead of a python
+module. Therefore, you can't import it with `import torch.ops.symm_mem`, nor
+can you import an op with `from torch.ops.symm_mem import one_shot_all_reduce`.
+You can call the op directly as in the example above.
+
+## Write your own kernel
+
+To write your own kernel doing communications with symmetric memory, you’ll need
+access to the addresses of mapped peer buffers and access to signal pads that
+are required for synchronization. In the kernel you’ll also need to perform
+correct synchronizations to make sure that peers are ready for communication,
+and signal to them that this GPU is ready.
+
+PyTorch Symmetric Memory provides CUDA Graph-compatible synchronization
+primitives that operate on the signal pad accompanying each symmetric memory
+allocation. Kernels using symmetric memory can be written both in CUDA and in
+Triton. Here’s an example allocating a symmetric tensor and exchanging handles:
+
+```python
+import torch.distributed._symmetric_memory as symm_mem
+
+dist.init_process_group()
+rank = dist.get_rank()
+
+# Allocate a tensor
+t = symm_mem.empty(4096, device=f"cuda:{rank}")
+# Establish symmetric memory and obtain the handle
+hdl = symm_mem.rendezvous(t, dist.group.WORLD)
+```
+
+Access to buffer pointers, multimem pointer, and signal pads is provided via:
+
+```python
+hdl.buffer_ptrs
+hdl.multicast_ptr
+hdl.signal_pad_ptrs
+```
+
+Data pointed to by `buffer_ptrs` can be accessed just like regular local data,
+and any necessary compute can also be performed in the usual ways. As with local
+data, you can and should use vectorized accesses to improve efficiency.
+
+Symmetric memory is especially convenient for writing kernels in Triton. While
+previously Triton removed the barriers to writing efficient CUDA code, now
+communications can be added easily to Triton kernels. The kernel below
+demonstrates a low-latency, all-reduce kernel written in Triton.
+
+```python
+@triton.jit
+def one_shot_all_reduce_kernel(
+    buf_tuple,
+    signal_pad_ptrs,
+    output_ptr,
+    numel: tl.constexpr,
+    rank: tl.constexpr,
+    world_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    ptx_utils.symm_mem_sync(
+        signal_pad_ptrs, None, rank, world_size, hasSubsequenceMemAccess=True
+    )
+
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+
+    while block_start < numel:
+        offsets = block_start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < numel
+        acc = tl.zeros((BLOCK_SIZE,), dtype=tl.bfloat16)
+
+        for i in tl.static_range(world_size):
+            buffer_rank = buf_tuple[i]
+            x = tl.load(buffer_rank + offsets, mask=mask)
+            acc += x
+
+        tl.store(output_ptr + offsets, acc, mask=mask)
+        block_start += tl.num_programs(axis=0) * BLOCK_SIZE
+
+    ptx_utils.symm_mem_sync(
+        signal_pad_ptrs, None, rank, world_size, hasPreviousMemAccess=True
+    )
+```
+
+Synchronizations at the beginning and the end of the kernel above guarantee that
+all the processes see consistent data. The bulk of the kernel is recognizable
+Triton code, and Triton will optimize it behind the scenes, making sure memory
+accesses are performed in an efficient way with vectorization and unrolling. As
+with all Triton kernels, it is easily modifiable to add extra computations or
+change the communication algorithm. Visit
+https://github.com/meta-pytorch/kraken/blob/main/kraken to see additional
+utilities and examples of using symmetric memory to implement common patterns in
+Triton.
+
+## Scale out
+
+Large language models distribute experts onto more than 8 GPUs, hence requiring
+multi-node access capability. RDMA-capable NICs help here. In addition,
+software libraries such as NVSHMEM or rocSHMEM abstract away the programming
+difference between intra-node access and inter-node access with primitives that
+are slightly higher level than pointer access, such as put and get.
+
+PyTorch provides NVSHMEM plugins to augment Triton kernels’ cross-node
+capabilities. As shown in the code snippet below, one can initiate a cross-node
+put command within the kernel.
+
+```python
+import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
+from torch.distributed._symmetric_memory._nvshmem_triton import requires_nvshmem
+
+@requires_nvshmem
+@triton.jit
+def my_put_kernel(
+    dest,
+    src,
+    nelems,
+    pe,
+):
+    nvshmem.put(dest, src, nelems, pe)
+```
+
+The `requires_nvshmem` decorator is used to indicate that the kernel requires
+the NVSHMEM device library as an external dependency. When Triton compiles the
+kernel, the decorator will search your system paths for the NVSHMEM device
+library. If it is available, Triton will include the necessary device assembly
+to use the NVSHMEM functions.
+
+## API Reference
+
+```{eval-rst}
+.. currentmodule:: torch.distributed._symmetric_memory
+```
+
+```{eval-rst}
+.. autofunction:: empty
+```
+
+```{eval-rst}
+.. autofunction:: rendezvous
+```
+
+```{eval-rst}
+.. autofunction:: is_nvshmem_available
+```
+
+```{eval-rst}
+.. autofunction:: set_backend
+```
+
+```{eval-rst}
+.. autofunction:: get_backend
+```
+
+## Op Reference
+:::{note}
+The following ops are hosted in the `torch.ops.symm_mem` namespace. You can call
+them directly via `torch.ops.symm_mem.<op_name>`.
+:::
+
+```{eval-rst}
+.. currentmodule:: torch.ops.symm_mem
+```
+
+```{eval-rst}
+.. py:function:: multimem_all_reduce_(input: Tensor, reduce_op: str, group_name: str) -> Tensor
+
+    Performs a multimem all-reduce operation on the input tensor. This operation
+    requires hardware support for multimem operations. On NVIDIA GPUs, NVLink
+    SHARP is required.
+
+    :param Tensor input: Input tensor to perform all-reduce on. Must be symmetric.
+    :param str reduce_op: Reduction operation to perform. Currently only "sum" is supported.
+    :param str group_name: Name of the group to perform all-reduce on.
+
+
+.. py:function:: multimem_all_gather_out(input: Tensor, group_name: str, out: Tensor) -> Tensor
+
+    Performs a multimem all-gather operation on the input tensor. This operation requires hardware support for multimem operations. On NVIDIA GPUs, NVLink SHARP is required.
+
+    :param Tensor input: Input tensor to perform all-gather on.
+    :param str group_name: Name of the group to perform all-gather on.
+    :param Tensor out: Output tensor to store the result of the all-gather operation. Must be symmetric.
+
+
+.. py:function:: one_shot_all_reduce(input: Tensor, reduce_op: str, group_name: str) -> Tensor
+
+    Performs a one-shot all-reduce operation on the input tensor.
+
+    :param Tensor input: Input tensor to perform all-reduce on. Must be symmetric.
+    :param str reduce_op: Reduction operation to perform. Currently only "sum" is supported.
+    :param str group_name: Name of the group to perform all-reduce on.
+
+
+.. py:function:: one_shot_all_reduce_out(input: Tensor, reduce_op: str, group_name: str, out: Tensor) -> Tensor
+
+    Performs a one-shot all-reduce operation based on the input tensor and writes the result to the output tensor.
+
+    :param Tensor input: Input tensor to perform all-reduce on. Must be symmetric.
+    :param str reduce_op: Reduction operation to perform. Currently only "sum" is supported.
+    :param str group_name: Name of the group to perform all-reduce on.
+    :param Tensor out: Output tensor to store the result of the all-reduce operation. Can be a regular tensor.
+
+
+.. py:function:: two_shot_all_reduce_(input: Tensor, reduce_op: str, group_name: str) -> Tensor
+
+    Performs a two-shot all-reduce operation on the input tensor.
+
+    :param Tensor input: Input tensor to perform all-reduce on. Must be symmetric.
+    :param str reduce_op: Reduction operation to perform. Currently only "sum" is supported.
+    :param str group_name: Name of the group to perform all-reduce on.
+
+
+.. py:function:: all_to_all_vdev(input: Tensor, out: Tensor, in_splits: Tensor, out_splits_offsets: Tensor, group_name: str) -> None
+
+    Performs an all-to-all-v operation using NVSHMEM, with split information provided on device.
+
+    :param Tensor input: Input tensor to perform all-to-all on. Must be symmetric.
+    :param Tensor out: Output tensor to store the result of the all-to-all operation. Must be symmetric.
+    :param Tensor in_splits: Tensor containing splits of data to send to each peer. Must be symmetric. Must be of size (group_size,). The splits are in the unit of elements in the 1st dimension.
+    :param Tensor out_splits_offsets: Tensor containing the splits and offsets of data received from each peer. Must be symmetric. Must be of size (2, group_size). The rows are (in order): output splits and output offsets.
+    :param str group_name: Name of the group to perform all-to-all on.
+
+
+.. py:function:: all_to_all_vdev_2d(input: Tensor, out: Tensor, in_splits: Tensor, out_splits_offsets: Tensor, group_name: str, [major_align: int = None]) -> None
+
+    Performs a 2D all-to-all-v operation using NVSHMEM, with split information provided on device. In Mixture of Experts models, this operation can be used to dispatch tokens.
+
+    :param Tensor input: Input tensor to perform all-to-all on. Must be symmetric.
+    :param Tensor out: Output tensor to store the result of the all-to-all operation. Must be symmetric.
+    :param Tensor in_splits: Tensor containing the splits of data to send to each expert. Must be symmetric. Must be of size (group_size * ne,), where ne is the number of experts per rank. The splits are in the unit of elements in the 1st dimension.
+    :param Tensor out_splits_offsets: Tensor containing the splits and offsets of data received from each peer. Must be symmetric. Must be of size (2, group_size * ne). The rows are (in order): output splits and output offsets.
+    :param str group_name: Name of the group to perform all-to-all on.
+    :param int major_align: Optional alignment for the major dimension of the output chunk for each expert. If not provided, the alignment is assumed to be 1. Any alignment adjustment will be reflected in the output offsets.
+
+    A 2D AllToAllv shuffle is illustrated below:
+    (world_size = 2, ne = 2, total number of experts = 4)::
+
+      Source: |       Rank 0      |       Rank 1      |
+              | c0 | c1 | c2 | c3 | d0 | d1 | d2 | d3 |
+
+      Dest  : |       Rank 0      |       Rank 1      |
+              | c0 | d0 | c1 | d1 | c2 | d2 | c3 | d3 |
+
+    where each `c_i` / `d_i` is a slice of the `input` tensor, targeting expert
+    `i`, with length indicated by input splits.  That is, the 2D AllToAllv
+    shuffle achieves a transpose from rank-major order at input to expert-major
+    order at output.
+
+    If `major_align` is not 1, the output offsets of c1, c2, c3 will be
+    up-aligned to this value. For example, if c0 has length 5 and d0 has
+    length 7 (making a total of 12), and if the `major_align` is set to 16,
+    the output offset of c1 will be 16. Similar for c2 and c3. This value has
+    no effect on the offset of the minor dimension, i.e.  d0, d1, d2 and d3.
+    Note: since cutlass does not support empty bins, we set the aligned length
+    to `major_align` if it is 0. See
+    https://github.com/pytorch/pytorch/issues/152668.
+
+
+.. py:function:: all_to_all_vdev_2d_offset(Tensor input, Tensor out, Tensor in_splits_offsets, Tensor out_splits_offsets, str group_name) -> None
+
+    Performs a 2D AllToAllv shuffle operation, with input split and offset
+    information provided on device. The input offsets are not required to be
+    the exact prefix sum of the input splits, i.e. padding is allowed between
+    the split chunks. The padding, however, will not be transferred to peer
+    ranks.
+
+    In Mixture of Experts models, this operation can be used to combine tokens
+    processed by experts on parallel ranks. This operation can be viewed as a
+    "reverse" operation to the `all_to_all_vdev_2d` operation (which shuffles
+    tokens to experts).
+
+    :param Tensor input: Input tensor to perform all-to-all on. Must be symmetric.
+    :param Tensor out: Output tensor to store the result of the all-to-all operation. Must be symmetric.
+    :param Tensor in_splits_offsets: Tensor containing the splits and offsets of data to send to each expert. Must be symmetric. Must be of size (2, group_size * ne), where `ne` is the number of experts. The rows are (in order): input splits and input offsets. The splits are in the unit of elements in the 1st dimension.
+    :param Tensor out_splits_offsets: Tensor containing the splits and offsets of data received from each peer. Must be symmetric. Must be of size (2, group_size * ne). The rows are (in order): output splits and output offsets.
+    :param str group_name: Name of the group to perform all-to-all on.
+
+```
--- a/test/test_multiprocessing_spawn.py
+++ b/test/test_multiprocessing_spawn.py
@ -265,6 +265,12 @@ class ParallelForkServerShouldWorkTest(TestCase, _TestMultiProcessing):
 )
 class ParallelForkServerPerfTest(TestCase):

+    @unittest.skipIf(
+        sys.version_info >= (3, 13, 8),
+        "Python 3.13.8+ changed forkserver module caching behavior",
+        # https://docs.python.org/3.13/whatsnew/changelog.html
+        # gh-126631
+    )
    def test_forkserver_perf(self):

        start_method = 'forkserver'
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@ -1429,6 +1429,7 @@ def _compile(
        fail_user_frame_lineno: Optional[int] = None
        torch._dynamo.utils.ReinplaceCounters.clear()
        guarded_code = None
+        tracer_output = None
        try:
            guarded_code, tracer_output = compile_inner(code, one_graph, hooks)

--- a/torch/csrc/cuda/shared/cudnn.cpp
+++ b/torch/csrc/cuda/shared/cudnn.cpp
@ -2,6 +2,7 @@
 // This file should only be compiled if this condition holds, so it should be
 // safe.
 #if defined(USE_CUDNN) || defined(USE_ROCM)
+#include <ATen/detail/CUDAHooksInterface.h>
 #include <torch/csrc/utils/pybind.h>

 #include <tuple>
@ -32,11 +33,7 @@ version_tuple getRuntimeVersion() {
 }

 size_t getVersionInt() {
-#ifndef USE_STATIC_CUDNN
-  return cudnnGetVersion();
-#else
-  return CUDNN_VERSION;
-#endif
+  return at::detail::getCUDAHooks().versionRuntimeCuDNN();
 }

 } // namespace
--- a/torch/distributed/_symmetric_memory/init.py
+++ b/torch/distributed/_symmetric_memory/init.py
@ -1674,8 +1674,6 @@ def empty(  # type: ignore[misc]
    device: _device | None = None,
 ) -> torch.Tensor:
    r"""
-    empty(*size, *, dtype=None, device=None) -> Tensor
-
    Similar to :func:`torch.empty()`. The returned tensor can be used by
    :func:`torch.distributed._symmetric_memory.rendezvous()` to establish a
    symmetric memory tensor among participating processes.
@ -1765,7 +1763,7 @@ def set_backend(name: Literal["NVSHMEM", "CUDA", "NCCL"]) -> None:

    Args:
        name (str): the backend for symmetric memory allocation. Currently,
-        only "NVSHMEM", "CUDA", "NCCL" are supported.
+            only `"NVSHMEM"`, `"CUDA"`, `"NCCL"` are supported.
    """
    _SymmetricMemory.set_backend(name)

@ -1776,8 +1774,7 @@ def get_backend(device: _device) -> str | None:
    found, return None.

    Args:
-        device (class:`torch.device` or str): the device for which to get the
-        backend.
+        device (`torch.device` or str): the device for which to get the backend.
    """
    return _SymmetricMemory.get_backend(torch.device(device))

@ -1785,9 +1782,10 @@ def get_backend(device: _device) -> str | None:
 def get_mempool_allocator(device: _device):  # type: ignore[no-untyped-def]
    r"""
    Get the MemPool allocator for symmetric memory for a given device.
+
    Args:
-        device (class:`torch.device` or str): the device for which to get the
-        MemPool allocator.
+        device (`torch.device` or str): the device for which to get the MemPool
+            allocator.
    """
    return _SymmetricMemory.get_mempool_allocator(torch.device(device))
Author	SHA1	Message	Date
pytorchbot	d38164a545	Enable Doc builds for: Minor Releases RCs. Minor and Patch Releases final RC (#167494 ) Enable Doc builds for: Minor Releases RCs. Minor and Patch Releases final RC (#167478) Enable Doc builds for 1. Minor Releases RCs 2. Minor and Patch Releases final RC This is done to prevent publishing doc for patch releases when building rcs. See: https://github.com/pytorch/docs/pull/57 Followup after: https://github.com/pytorch/pytorch/pull/153973 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167478 Approved by: https://github.com/svekars, https://github.com/seemethere (cherry picked from commit f6331192b4b105bb8a20823dc02e33b55e5c91e2) Co-authored-by: atalman <atalman@fb.com>	2025-11-10 20:02:50 -05:00
pytorchbot	b002562550	Add doc for Symmetric Memory (#167477 ) Add doc for Symmetric Memory (#166148) Pull Request resolved: https://github.com/pytorch/pytorch/pull/166148 Approved by: https://github.com/fduwjj (cherry picked from commit 1e2e7cb18ba8018d70ec0450fcac7f81cf01c04e) Co-authored-by: Ke Wen <kw2501@meta.com>	2025-11-10 15:18:23 -05:00
pytorchbot	5811a8d7da	[cuDNN][SDPA][Convolution] Expose cuDNN runtime version in CUDA hooks (#167327 ) [cuDNN][SDPA][Convolution] Expose cuDNN runtime version in CUDA hooks (#167111) cuDNN dispatching heuristics rely on versions checks but currently only that compile-time version is exposed, if we want to allow users to resolve https://github.com/pytorch/pytorch/issues/166643 on their end by updating their cuDNN version locally we need to check the runtime version rather than compile-time version. Pull Request resolved: https://github.com/pytorch/pytorch/pull/167111 Approved by: https://github.com/Skylion007 (cherry picked from commit e678450a69f6bf3b6f3ea7657d444ce9bba19940) Co-authored-by: Eddie Yan <eddiey@nvidia.com>	2025-11-07 17:04:27 -05:00
pytorchbot	f36c764ca4	[dynamo][ez] Initialize tracer_output to None by default. (#167366 ) [dynamo][ez] Initialize tracer_output to None by default. (#163169) Summary: In edge cases, tracer_output can be left unset if there's double exception raised which causes the following issue: ``` UnboundLocalError: local variable 'tracer_output' referenced before assignment ``` Default initialize this variable so that it's always present. Test Plan: CI Rollback Plan: Differential Revision: D82652815 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163169 Approved by: https://github.com/tugsbayasgalan (cherry picked from commit 6189a5f7315ac5affdaeafdbea0a85d14925506d) Co-authored-by: Zhengxu Chen <zhxchen17@meta.com>	2025-11-07 17:03:33 -05:00
pytorchbot	6877288115	Change forkserver test to only run below 3.13.8 (#167361 ) Change forkserver test to only run below 3.13.8 (#165667) A multiprocessing bug is fixed in 3.13.8, see [https://docs.python.org/3.13/whatsnew/changelog.html](https://l.workplace.com/l.php?u=https%3A%2F%2Fdocs.python.org%2F3.13%2Fwhatsnew%2Fchangelog.html&h=AT0qUhHJq5c2UJvQaq9_MrSo0mVhwn1VOfq1nDQl2C1UOhDI80RMbzVayhG7LSAT1uYHKtkftKnBDwiGMhbw0YRvQLe5vwE01qejpPFautHvU3LXeOE1KChPykqz3qnCRzk7czu_iNzQ05shR4F1N_qYOzR5YxejA52ZZQ), [gh-126631](https://github.com/python/cpython/issues/126631) So this test will fail when we update to python 3.13.8 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165667 Approved by: https://github.com/malfet (cherry picked from commit d4a713cd9c8ea1dc13917d3311d73c13914306a6) Co-authored-by: Shangdi Yu <shangdiy@meta.com>	2025-11-07 16:58:33 -05:00