Compare commits

210 Commits

Author SHA1 Message Date
239ea930a3 Remove pytorch quant docs since we are moving to torchao
Summary:
att

Test Plan:
doc page generated from CI

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 14150e4e3a5d65f1a9eb5a4201fcd4bac09920f1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157766
2025-07-09 15:31:30 -07:00
edf7bb4f51 Fix unbound local when an error occurs before pool is initialized (#156750)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156750
Approved by: https://github.com/jamesjwu
2025-07-08 00:28:21 +00:00
bbb930aba2 Bump urllib3 from 2.2.2 to 2.5.0 in /tools/build/bazel (#156390)
Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.2.2 to 2.5.0.
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/2.2.2...2.5.0)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-version: 2.5.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-07 17:13:21 -07:00
60b41de0ca remove allow-untyped-defs from torch/ao/nn/quantized/modules/rnn.py (#157234)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157234
Approved by: https://github.com/jingsh
ghstack dependencies: #157231, #157232
2025-07-08 00:11:52 +00:00
e38a335d7f remove allow-untyped-defs from torch/backends/cusparselt/__init__.py (#157232)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157232
Approved by: https://github.com/jingsh
ghstack dependencies: #157231
2025-07-08 00:11:52 +00:00
9d8cf24b3b remove allow-untyped-defs from torch/_classes.py (#157231)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157231
Approved by: https://github.com/jingsh
2025-07-08 00:11:52 +00:00
be56a8d7ac Automatically load and save dynamo entries via caching_precompile (#155913)
This PR adds a new config option, `caching_precompile`, and a `DynamoCache`, which loads and saves Dynamo Cache entries automatically. It also hooks up DynamoCache to PrecompileContext, so that we can save multiple cache entries.

When this configuration is turned on, we:
- Automatically create and initialize a CompilePackage on every torch.compile
- Automatically use BundledAutogradCache
- Automatically save the CompilePackage entry to DynamoCache after every compile

You can also use PrecompileContext.serialize() to manually serialize a full object.

I've added unit tests to exhibit this behavior.
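As a rough sketch, enabling the flag could look like the following; the flag name is from the text above, but the assumption that it lives on `torch._dynamo.config` is mine:

```python
import torch
import torch._dynamo.config as dynamo_config

dynamo_config.caching_precompile = True  # assumed location of the new flag

@torch.compile
def f(x):
    return x.sin() + x.cos()

# On the first call, a CompilePackage is created and initialized, and the
# resulting entry is saved to DynamoCache automatically after compile.
f(torch.randn(8))
```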

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155913
Approved by: https://github.com/zhxchen17
2025-07-07 23:57:17 +00:00
3effe0c293 [SymmMem] find_path does not search /usr/local/lib (#157695)
This PR uses `find_library` to replace `find_path`.
It also searches for NVSHMEM host lib and device lib separately.

Tested against the system install locations /usr/local/lib and /usr/local/include.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157695
Approved by: https://github.com/Skylion007
ghstack dependencies: #157513
2025-07-07 23:16:45 +00:00
2fde2090d0 [inductor_collectives] Make reorder_collectives_preserve_peak pass grouping nodes (#157706)
Differential Revision: [D77861765](https://our.internmc.facebook.com/intern/diff/D77861765)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157706
Approved by: https://github.com/wconstab
2025-07-07 23:13:58 +00:00
5d8d126249 Fix einops x torch.compile interaction (#157600)
Fixes https://github.com/pytorch/pytorch/issues/157451

If/when einops releases a version greater than 0.8.1, it will just break
(without this patch).

The history is:
- Between 2.6 and 2.7, we tried to delete the einops import (#142847)
- That didn't work so well, so we applied a hotfix in 2.7.1. (#153925)
- The hotfix wasn't completely correct (0.8.1 is the latest version of
  einops, so the condition in the hotfix just always evaluates to True!)
- It turns out we didn't need to delete the einops import. We already
  do not eagerly import einops.
- I reverted the code back to the state it was in as of 2.6.
  https://github.com/pytorch/pytorch/blob/release/2.6/torch/_dynamo/decorators.py

Test Plan:
- We have testing in CI for einops 0.6.1, 0.7.0, and 0.8.1. Wait for CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157600
Approved by: https://github.com/guilhermeleobas, https://github.com/anijain2305
ghstack dependencies: #157416
2025-07-07 23:04:02 +00:00
378c121d5e Remove unnecessary warnings during the ATen compilation process. (#157703)
Comparing `uint32_t(num_threads())` with `int(kCUDABlockReduceMaxThreads)` always results in a compilation warning. Just change the return type of `kCUDABlockReduceMaxThreads` to `uint32_t` to avoid it.
Fixes https://github.com/pytorch/pytorch/issues/157701

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157703
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-07-07 22:49:38 +00:00
7e83d50845 Inductor logging + analysis of torch.profile (#149697)
Prereqs:
 - https://github.com/pytorch/pytorch/pull/152708

Features:
1. Adds inductor's estimate of flops and bandwidth to the json trace events that perfetto uses.
1. Only use the TFLOPS estimation from Triton if we don't have the info from the datasheet, because Triton's estimates are inaccurate. I have a backlog item to fix Triton flops estimation upstream. New `DeviceInfo` class, and new function `get_device_tflops`.
1. New helpers `countable_fx` and `count_flops_fx` help get the flops of an `fx.Node`.
1. Extends Triton `torch.profiler` logging to `DebugAutotuner`.
1. New script `profile_analysis.py`: `--augment_trace` adds perf estimates to any perfetto json trace, `--analyze` creates a summary table of these perf estimates, and `--diff` will compare two traces side by side:
```
Device(NVIDIA H100, 0):
 Kernel Name                              | resnet Kernel Count | resnet FLOPS       | resnet bw gbps        | resnet Dur (ms)    | resnet Achieved FLOPS % | resnet Achieved Bandwidth % | newresnet Kernel Count | newresnet FLOPS    | newresnet bw gbps     | newresnet Dur (ms) | newresnet Achieved FLOPS % | newresnet Achieved Bandwidth %
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 triton_poi_fused__native_batch_norm_legi | 24                  | 0                  | 0.11395268248131513   | 2.5919166666666666 | 0                       | 0.003401572611382541        | 24                     | 0                  | 0.11395268248131513   | 2.5919166666666666 | 0                          | 0.003401572611382541
 sm90_xmma_fprop_implicit_gemm_f32f32_tf3 | 142                 | 16932673552.422373 | 0.2585007824198784    | 12.441619718309857 | 0.08683422334575583     | 0.007716441266265022        | 142                    | 16932673552.422373 | 0.2585007824198784    | 12.441619718309857 | 0.08683422334575583        | 0.007716441266265022
 triton_red_fused__native_batch_norm_legi | 39                  | 0                  | 0.13990024992108846   | 5.752589743589743  | 0                       | 0.004176126863316074        | 39                     | 0                  | 0.13990024992108846   | 5.752589743589743  | 0                          | 0.004176126863316074
 triton_poi_fused__native_batch_norm_legi | 25                  | 0                  | 0.31824055917536503   | 2.5291999999999994 | 0                       | 0.009499718184339253        | 25                     | 0                  | 0.31824055917536503   | 2.5291999999999994 | 0                          | 0.009499718184339253
 void cutlass::Kernel2<cutlass_80_tensoro | 98                  | 16211056473.596165 | 0.42972434051025826   | 7.130408163265306  | 0.08313362294151874     | 0.012827592254037562        | 98                     | 16211056473.596165 | 0.42972434051025826   | 7.130408163265306  | 0.08313362294151874        | 0.012827592254037562
 triton_red_fused__native_batch_norm_legi | 73                  | 0                  | 0.3225381327611705    | 9.987068493150682  | 0                       | 0.009628003963020014        | 73                     | 0                  | 0.3225381327611705    | 9.987068493150682  | 0                          | 0.009628003963020014
 triton_poi_fused__native_batch_norm_legi | 15                  | 0                  | 1.4491211346487216    | 4.439333333333333  | 0                       | 0.043257347302946926        | 15                     | 0                  | 1.4491211346487216    | 4.439333333333333  | 0                          | 0.043257347302946926
 void cutlass::Kernel2<cutlass_80_tensoro | 186                 | 14501701145.337954 | 0.2667131401910989    | 7.873865591397849  | 0.07436769818122027     | 0.007961586274361157        | 186                    | 14501701145.337954 | 0.2667131401910989    | 7.873865591397849  | 0.07436769818122027        | 0.007961586274361157
 triton_poi_fused__native_batch_norm_legi | 33                  | 0                  | 1.4924556538193923    | 4.3101515151515155 | 0                       | 0.044550915039384846        | 33                     | 0                  | 1.4924556538193923    | 4.3101515151515155 | 0                          | 0.044550915039384846
 triton_red_fused__native_batch_norm_legi | 29                  | 0                  | 0.25562590522631107   | 6.296275862068965  | 0                       | 0.007630624036606301        | 29                     | 0                  | 0.25562590522631107   | 6.296275862068965  | 0                          | 0.007630624036606301
 triton_poi_fused__native_batch_norm_legi | 13                  | 0                  | 0.5870562174192726    | 2.7397692307692307 | 0                       | 0.01752406619162008         | 13                     | 0                  | 0.5870562174192726    | 2.7397692307692307 | 0                          | 0.01752406619162008
 triton_poi_fused__native_batch_norm_legi | 34                  | 0                  | 0.41409928846284      | 2.853588235294117  | 0                       | 0.012361172789935523        | 34                     | 0                  | 0.41409928846284      | 2.853588235294117  | 0                          | 0.012361172789935523
 triton_per_fused__native_batch_norm_legi | 34                  | 0                  | 0.11705315007018151   | 3.460647058823529  | 0                       | 0.0034941238826919864       | 34                     | 0                  | 0.11705315007018151   | 3.460647058823529  | 0                          | 0.0034941238826919864
 triton_poi_fused__native_batch_norm_legi | 16                  | 0                  | 0.17207853197124584   | 2.3459375000000002 | 0                       | 0.005136672596156592        | 16                     | 0                  | 0.17207853197124584   | 2.3459375000000002 | 0                          | 0.005136672596156592
 triton_per_fused__native_batch_norm_legi | 30                  | 0                  | 0.2639714322022256    | 6.131199999999999  | 0                       | 0.007879744244842555        | 30                     | 0                  | 0.2639714322022256    | 6.131199999999999  | 0                          | 0.007879744244842555
 sm90_xmma_fprop_implicit_gemm_f32f32_tf3 | 100                 | 11875430356.891787 | 0.19494470869421385   | 16.36534           | 0.06089964285585531     | 0.005819245035648175        | 100                    | 11875430356.891787 | 0.19494470869421385   | 16.36534           | 0.06089964285585531        | 0.005819245035648175
 triton_poi_fused__native_batch_norm_legi | 8                   | 0                  | 0.9854096626224687    | 3.2757500000000004 | 0                       | 0.029415213809625928        | 8                      | 0                  | 0.9854096626224687    | 3.2757500000000004 | 0                          | 0.029415213809625928
 void cublasLt::splitKreduce_kernel<32, 1 | 56                  | 34377923395.147064 | 0.8310300045762317    | 3.4199999999999986 | 0.17629704305203628     | 0.024806865808245714        | 56                     | 34377923395.147064 | 0.8310300045762317    | 3.4199999999999986 | 0.17629704305203628        | 0.024806865808245714
 triton_poi_fused__native_batch_norm_legi | 23                  | 0                  | 0.9944002965861103    | 3.2431304347826084 | 0                       | 0.02968359094286896         | 23                     | 0                  | 0.9944002965861103    | 3.2431304347826084 | 0                          | 0.02968359094286896
 triton_per_fused__native_batch_norm_legi | 10                  | 0                  | 0.1826801058931057    | 4.428800000000001  | 0                       | 0.00545313748934644         | 10                     | 0                  | 0.1826801058931057    | 4.428800000000001  | 0                          | 0.00545313748934644
 triton_poi_fused__native_batch_norm_legi | 10                  | 0                  | 0.3168973585366449    | 2.5471999999999997 | 0                       | 0.009459622642884923        | 10                     | 0                  | 0.3168973585366449    | 2.5471999999999997 | 0                          | 0.009459622642884923
 triton_poi_fused__native_batch_norm_legi | 34                  | 0                  | 1.1463614897015777    | 4.124323529411764  | 0                       | 0.03421974596124114         | 34                     | 0                  | 1.1463614897015777    | 4.124323529411764  | 0                          | 0.03421974596124114
 void cask_plugin_cudnn::xmma_cudnn::init | 44                  | 44045510816.64277  | 2.0661232850348643    | 3.6887499999999993 | 0.22587441444432194     | 0.06167532194133924         | 44                     | 44045510816.64277  | 2.0661232850348643    | 3.6887499999999993 | 0.22587441444432194        | 0.06167532194133924
 sm90_xmma_fprop_implicit_gemm_f32f32_tf3 | 95                  | 7876855400.165316  | 0.4694941555946739    | 18.224315789473682 | 0.04039413025725802     | 0.014014750913273854        | 95                     | 7876855400.165316  | 0.4694941555946739    | 18.224315789473682 | 0.04039413025725802        | 0.014014750913273854
 triton_per_fused__native_batch_norm_legi | 41                  | 0                  | 0.06825669875995298   | 3.0384146341463416 | 0                       | 0.002037513395819492        | 41                     | 0                  | 0.06825669875995298   | 3.0384146341463416 | 0                          | 0.002037513395819492
 triton_poi_fused__native_batch_norm_legi | 23                  | 0                  | 0.08808154712430301   | 2.3275652173913044 | 0                       | 0.0026292999141582997       | 23                     | 0                  | 0.08808154712430301   | 2.3275652173913044 | 0                          | 0.0026292999141582997
 triton_per_fused__native_batch_norm_legi | 40                  | 0                  | 0.18179321034952417   | 4.556825           | 0                       | 0.005426662995508183        | 40                     | 0                  | 0.18179321034952417   | 4.556825           | 0                          | 0.005426662995508183
 triton_poi_fused__native_batch_norm_legi | 15                  | 0                  | 0.5887415155454232    | 2.783866666666667  | 0                       | 0.017574373598370836        | 15                     | 0                  | 0.5887415155454232    | 2.783866666666667  | 0                          | 0.017574373598370836
 void cutlass::Kernel2<cutlass_80_tensoro | 38                  | 14242013806.264643 | 0.256592404353939     | 7.217631578947369  | 0.0730359682372546      | 0.007659474756834           | 38                     | 14242013806.264643 | 0.256592404353939     | 7.217631578947369  | 0.0730359682372546         | 0.007659474756834
 triton_poi_fused__native_batch_norm_legi | 21                  | 0                  | 0.5842860973430516    | 2.7779047619047623 | 0                       | 0.017441376040091088        | 21                     | 0                  | 0.5842860973430516    | 2.7779047619047623 | 0                          | 0.017441376040091088
 triton_per_fused__native_batch_norm_legi | 16                  | 0                  | 0.11509365173486417   | 3.5959375000000002 | 0                       | 0.0034356313950705724       | 16                     | 0                  | 0.11509365173486417   | 3.5959375000000002 | 0                          | 0.0034356313950705724
 triton_poi_fused__native_batch_norm_legi | 14                  | 0                  | 0.1704672000243914    | 2.4044285714285714 | 0                       | 0.00508857313505646         | 14                     | 0                  | 0.1704672000243914    | 2.4044285714285714 | 0                          | 0.00508857313505646
 triton_poi_fused__native_batch_norm_legi | 58                  | 0                  | 2.307520779930795     | 8.190706896551722  | 0                       | 0.06888121731136704         | 58                     | 0                  | 2.307520779930795     | 8.190706896551722  | 0                          | 0.06888121731136704
 triton_per_fused__native_batch_norm_legi | 29                  | 0                  | 0.037243248971881276  | 3.0277586206896556 | 0                       | 0.001111738775280038        | 29                     | 0                  | 0.037243248971881276  | 3.0277586206896556 | 0                          | 0.001111738775280038
 triton_poi_fused__native_batch_norm_legi | 20                  | 0                  | 0.04741699795428918   | 2.2911500000000005 | 0                       | 0.0014154327747549007       | 20                     | 0                  | 0.04741699795428918   | 2.2911500000000005 | 0                          | 0.0014154327747549007
 triton_per_fused__native_batch_norm_legi | 25                  | 0                  | 0.13357016893727824   | 3.37536            | 0                       | 0.003987169222008305        | 25                     | 0                  | 0.13357016893727824   | 3.37536            | 0                          | 0.003987169222008305
 triton_poi_fused__native_batch_norm_legi | 13                  | 0                  | 0.3089862268300253    | 2.8111538461538457 | 0                       | 0.009223469457612694        | 13                     | 0                  | 0.3089862268300253    | 2.8111538461538457 | 0                          | 0.009223469457612694
 triton_poi_fused__native_batch_norm_legi | 17                  | 0                  | 0.3129385387909844    | 2.673              | 0                       | 0.009341448919133863        | 17                     | 0                  | 0.3129385387909844    | 2.673              | 0                          | 0.009341448919133863
 triton_per_fused__native_batch_norm_legi | 19                  | 0                  | 0.2215568162533158    | 3.8837368421052636 | 0                       | 0.0066136363060691275       | 19                     | 0                  | 0.2215568162533158    | 3.8837368421052636 | 0                          | 0.0066136363060691275
 std::enable_if<!(false), void>::type int | 23                  | 504916805.19297093 | 1.0118296096314707    | 8.113913043478261  | 0.0025893169497075447   | 0.030203868944223014        | 23                     | 504916805.19297093 | 1.0118296096314707    | 8.113913043478261  | 0.0025893169497075447      | 0.030203868944223014
 triton_poi_fused_add_copy__38            | 56                  | 0                  | 0                     | 2.132482142857143  | 0                       | 0                           | 56                     | 0                  | 0                     | 2.132482142857143  | 0                          | 0
 triton_poi_fused_convolution_0           | 18                  | 0                  | 0.43458610794936897   | 2.773333333333334  | 0                       | 0.012972719640279667        | 18                     | 0                  | 0.43458610794936897   | 2.773333333333334  | 0                          | 0.012972719640279667
 triton_poi_fused_convolution_1           | 17                  | 0                  | 0.028816312469162712  | 2.6145882352941174 | 0                       | 0.0008601884319153051       | 17                     | 0                  | 0.028816312469162712  | 2.6145882352941174 | 0                          | 0.0008601884319153051
 void convolve_common_engine_float_NHWC<f | 44                  | 8641868995.31118   | 0.024730540008465626  | 25.87327272727273  | 0.04431727689903169     | 0.0007382250748795709       | 44                     | 8641868995.31118   | 0.024730540008465626  | 25.87327272727273  | 0.04431727689903169        | 0.0007382250748795709
 triton_per_fused__native_batch_norm_legi | 12                  | 0                  | 0.6809930918986744    | 4.82675            | 0                       | 0.020328151996975356        | 12                     | 0                  | 0.6809930918986744    | 4.82675            | 0                          | 0.020328151996975356
 triton_per_fused__native_batch_norm_legi | 14                  | 0                  | 0.02883030597936608   | 2.6651428571428575 | 0                       | 0.0008606061486377935       | 14                     | 0                  | 0.02883030597936608   | 2.6651428571428575 | 0                          | 0.0008606061486377935
 triton_per_fused__native_batch_norm_legi | 16                  | 0                  | 0.0014658988233201874 | 2.098              | 0                       | 4.375817383045335e-05       | 16                     | 0                  | 0.0014658988233201874 | 2.098              | 0                          | 4.375817383045335e-05
 triton_poi_fused__native_batch_norm_legi | 13                  | 0                  | 0.9926297180284697    | 3.2367692307692306 | 0                       | 0.02963073785159611         | 13                     | 0                  | 0.9926297180284697    | 3.2367692307692306 | 0                          | 0.02963073785159611
 triton_poi_fused__native_batch_norm_legi | 9                   | 0                  | 1.3008817095666507    | 3.0863333333333336 | 0                       | 0.03883228983781048         | 9                      | 0                  | 1.3008817095666507    | 3.0863333333333336 | 0                          | 0.03883228983781048
 void at::native::(anonymous namespace):: | 98                  | 0                  | 0.09174335613709389   | 4.408520408163265  | 0                       | 0.0027386076458833994       | 98                     | 0                  | 0.09174335613709389   | 4.408520408163265  | 0                          | 0.0027386076458833994
 void at::native::vectorized_elementwise_ | 7                   | 0                  | 0                     | 1.7278571428571428 | 0                       | 0                           | 7                      | 0                  | 0                     | 1.7278571428571428 | 0                          | 0
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149697
Approved by: https://github.com/eellison, https://github.com/shunting314
2025-07-07 22:13:34 +00:00
6f05d58f2b [AOTI] Split aoti_runtime/model.h to prepare for model static linking (#157592)
Summary:
Prepare for https://github.com/pytorch/pytorch/pull/157129.

We split the file so we can re-use the `model.h` part to codegen a separate header for each model when using static linkage.

Test Plan:
CI

Rollback Plan:

Differential Revision: D77761249

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157592
Approved by: https://github.com/desertfire
2025-07-07 22:13:22 +00:00
a7eb153bba [MemoryViz] Add file selector button (#157647)
In some Linux desktop environments like mine, there is no drag-and-drop of files, which made MemoryViz impossible for me to use. So this adds a file selector button as an alternative. Tested that it works locally, and also with multiple files.

![image](https://github.com/user-attachments/assets/dcb61d68-6c6f-42f6-a075-1783d747d1b0)

And the button remains when something is loaded, to allow loading something else, but it moves out of the way to save vertical space:

![image](https://github.com/user-attachments/assets/4239d13c-3d80-4790-9696-0906c75e14e6)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157647
Approved by: https://github.com/sraikund16
2025-07-07 22:03:51 +00:00
ed6df0e324 correctly import torch.version (#157584)
The structure is

```
torch/
  __init__.py
  version.py
```

When we import torch, only `torch/__init__.py` is executed by default.

The submodules like `version.py` are not automatically imported or attached to the torch module.

Without anything in `__init__.py`, `torch.version` may not be found, so in this PR we make the import explicit.
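The same semantics can be observed with a standard-library package: `import logging` executes `logging/__init__.py` but does not attach the `logging.config` submodule (run in a fresh interpreter):

```python
import importlib

import logging
print(hasattr(logging, "config"))         # False: submodule not imported yet
importlib.import_module("logging.config")
print(hasattr(logging, "config"))         # True: now attached to the package
```
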
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157584
Approved by: https://github.com/ezyang
2025-07-07 21:43:35 +00:00
5c79a55e7e [oss] Add version to metadata (#155343)
Summary: We want to add a version to the DCP metadata so that whenever planner logic changes, we can use the version recorded at save time to determine how to load the data.

Test Plan:
added a test

Rollback Plan:

Differential Revision: D76135887

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155343
Approved by: https://github.com/teja-rao
2025-07-07 20:57:30 +00:00
3d06ff82a8 [release] Triton pin update to 3.4 (#156664)
Triton pin update issue: https://github.com/pytorch/pytorch/issues/154206
Please see post: https://dev-discuss.pytorch.org/t/2-8-final-rc-release-postponed-by-a-week/3101

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156664
Approved by: https://github.com/davidberard98
2025-07-07 20:52:25 +00:00
2efa5eaa65 swa avoid stream sync (#157705)
Summary:
When `AveragedModel.update_parameters` runs, it evaluates `self.n_averaged == 0` for each parameter, where `n_averaged` is a buffer on the GPU, forcing a stream sync every time. Moving the check before the loop calls the sync once.

This improves `update_parameters` from 74ms to 57ms, a ~22% improvement.
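A hedged sketch of the change (illustrative names, not the actual `AveragedModel` code):

```python
# Before: `self.n_averaged == 0` was evaluated inside the per-parameter loop;
# since n_averaged is a GPU buffer, each evaluation forced a stream sync.
# After: hoist the check out of the loop so only one sync happens per update.
first_update = bool(self.n_averaged == 0)  # single GPU -> CPU sync
for p_avg, p in zip(averaged_params, current_params):
    if first_update:
        p_avg.detach().copy_(p)
    else:
        p_avg.detach().copy_(self.avg_fn(p_avg.detach(), p, self.n_averaged))
```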

Test Plan:
CI

Rollback Plan:

Differential Revision: D77723025

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157705
Approved by: https://github.com/albanD, https://github.com/Skylion007, https://github.com/janeyx99
2025-07-07 20:47:35 +00:00
c2510fcd86 Fix index_put propagate strategy arg unpack error (#157671)
Fix: the `index_put` propagation strategy didn't consider the optional arg `accumulate`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157671
Approved by: https://github.com/fmassa, https://github.com/wconstab
2025-07-07 20:18:18 +00:00
510c398a4f Add max_pool3d backward pass for MPS (#157498)
Note on backward precision over fp16:

A float16 number has 10 bits of mantissa, 5 bits of exponent, and 1 bit for the sign. If the sign bit is positive, then with a mantissa $m$ and exponent $e$ written in base 10, the number that the float16 format represents is $(1 + m/1024) \cdot 2^e$. ([source](https://en.wikipedia.org/wiki/Half-precision_floating-point_format))

Consider adding two numbers $a$ and $b$ which have arbitrary mantissas, and say their exponents are $e_a = 1$ (so $2 \le a < 4$) and $e_b = -3$ (so $0.125 \le b < 0.25$). Assume that the result has the same exponent as $a$. Since the exponents differ by 4, we'll effectively need to truncate the 4 rightmost bits of $b$'s mantissa, which would introduce a maximum error on the order of $(2^4/1024) \cdot 2^{-3} \approx 0.002$.

The error is nearly the same if $e_b = -2$ (so $0.25 \le b < 0.5$), where the 3 rightmost bits are truncated, giving a maximum error on the order of $(2^3/1024) \cdot 2^{-2} \approx 0.002$. Same for $e_b = -1$.

So if we're adding up nine different numbers that all have exponents -3, -2, or -1, and they sum to a number with exponent 1, then we would expect a maximum error several times greater than 0.002. In my comments above, summing those particular nine numbers in different ways gave results that ranged between 3.1816 and 3.1758, a difference of $0.0058 \approx 2.9 \times 0.002$.

That's within the acceptable bounds, and we can safely just increase the error tolerance used in `test_output_grad_match` for the case of `max_pool3d_backward` with float16.
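The effect is easy to reproduce: summing the same float16 addends in a different order shifts the result by a few multiples of 0.002 (illustrative values, not the exact ones from the PR comments):

```python
import torch

# Nine addends with exponents around -3..-1 whose sum has exponent 1,
# matching the analysis above.
vals = torch.tensor(
    [0.437, 0.311, 0.253, 0.499, 0.191, 0.372, 0.428, 0.344, 0.297],
    dtype=torch.float16,
)
ascending = torch.sort(vals).values.sum()
descending = torch.sort(vals, descending=True).values.sum()
print(ascending.item(), descending.item())  # can differ on the order of 0.002
```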

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157498
Approved by: https://github.com/malfet
2025-07-07 19:46:44 +00:00
63a96eaeb8 [DeviceMesh] Add error when users try to slice non contiguous flattened dim submesh (#157523)
With https://github.com/pytorch/pytorch/issues/157393, we want to first throw a clearer error for users and then fix it in the long term.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157523
Approved by: https://github.com/fegin
ghstack dependencies: #157501
2025-07-07 19:43:51 +00:00
2b8d3b1b2b [DeviceMesh] Use user set backend and pg option even for the global mesh (#157501)
Short term solution to https://github.com/pytorch/pytorch/issues/156593.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157501
Approved by: https://github.com/fegin, https://github.com/lw
2025-07-07 19:43:51 +00:00
bf1ebe0531 Fix typo: 'paramter' → 'parameter' in dynamo variable comment (#157651)
This PR fixes a minor typo in a comment in `torch/_dynamo/variables/torch.py`, changing 'paramter' to the correct spelling 'parameter'.

These small but meaningful changes help improve code readability and maintain the overall quality of the codebase.

Thanks for your time and review!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157651
Approved by: https://github.com/Skylion007
2025-07-07 19:42:44 +00:00
433a247102 [logging] [redo] dynamo_timed for CachingAutotuner.coordinate_descent_tuning (#156840)
Summary: This is a redo of https://github.com/pytorch/pytorch/pull/156517, but with pt2_compile_events logging disabled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156840
Approved by: https://github.com/jamesjwu
2025-07-07 19:09:48 +00:00
8a47f9d03b [CI] Fix xpu ci test sccache issue (#157693)
With PR #157341 landed, it broke the XPU CI test on sccache, which had been disabled by #143851. Re-disable it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157693
Approved by: https://github.com/atalman, https://github.com/huydhn
2025-07-07 18:29:38 +00:00
9e5f4a844c [FSDP2] Fix issue with set_reduce_scatter_divide_factor errors and MixedPrecisionPolicy (#155964)
fix https://github.com/pytorch/pytorch/issues/155223

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155964
Approved by: https://github.com/weifengpy
2025-07-07 17:09:29 +00:00
cyy
7c1f627828 Fix 'dllimport attribute ignored on inline function' (#157670)
There are lots of warnings in builds:
```
 2025-07-05T16:59:46.9208806Z C:\actions-runner\_work\pytorch\pytorch\build\aten\src\ATen\core\TensorBody.h(5043,29): warning: 'at::Tensor::less_' redeclared inline; 'dllimport' attribute ignored [-Wignored-attributes]
2025-07-05T16:59:46.9209030Z  5043 | inline at::Tensor & Tensor::less_(const at::Scalar & other) const {
2025-07-05T16:59:46.9209104Z       |                             ^
2025-07-05T16:59:46.9209671Z C:\actions-runner\_work\pytorch\pytorch\build\aten\src\ATen\core\TensorBody.h(5048,29): warning: 'at::Tensor::less_' redeclared inline; 'dllimport' attribute ignored [-Wignored-attributes]
2025-07-05T16:59:46.9209860Z  5048 | inline at::Tensor & Tensor::less_(const at::Tensor & other) const
```
This PR has fixed them and turned the warning into an error.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157670
Approved by: https://github.com/albanD
2025-07-07 16:57:48 +00:00
b3b4d28f4c [submodule][cutlass] Update pin to b995f93 v4.0.0 (#157376)
@Skylion007 seems afk. https://github.com/pytorch/pytorch/pull/153541

https://github.com/NVIDIA/cutlass/releases/tag/v4.0.0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157376
Approved by: https://github.com/drisspg, https://github.com/Skylion007
2025-07-07 16:55:47 +00:00
ae1094b72b Revert "[WIP] Automatically load and save dynamo entries via caching_precompile (#155913)"
This reverts commit e466dab164d9236bfe5817ec8e4d24c7b9d3e392.

Reverted https://github.com/pytorch/pytorch/pull/155913 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to fail a test in trunk ([comment](https://github.com/pytorch/pytorch/pull/155913#issuecomment-3045914878))
2025-07-07 16:53:35 +00:00
eda0a9cc90 [list] Add list.__delitem__ (#156339)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156339
Approved by: https://github.com/zou3519
ghstack dependencies: #153969, #156148, #156242, #156270, #156271
2025-07-07 14:51:32 +00:00
d74ccf4ffe [list] Add list.__mul__ and list.__imul__ (#156271)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156271
Approved by: https://github.com/zou3519
ghstack dependencies: #153969, #156148, #156242, #156270
2025-07-07 14:51:32 +00:00
689fba032d Implement list.__add__ and list.__iadd__ (#156270)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156270
Approved by: https://github.com/Skylion007, https://github.com/zou3519
ghstack dependencies: #153969, #156148, #156242
2025-07-07 14:51:25 +00:00
c1d69d5dd5 [list] Implement list.remove (#156242)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156242
Approved by: https://github.com/Skylion007, https://github.com/zou3519
ghstack dependencies: #153969, #156148
2025-07-07 14:51:17 +00:00
e49acfc5c5 [list] Raise exception in invalid list method call (#156148)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156148
Approved by: https://github.com/zou3519
ghstack dependencies: #153969
2025-07-07 14:51:10 +00:00
034e996d37 [list] Implement list.count (#153969)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153969
Approved by: https://github.com/zou3519, https://github.com/XuehaiPan
2025-07-07 14:51:03 +00:00
16c3b4143b [gtest][listing] Enable gtest json listing for the fbcode/caffe2 project (#156816)
***SUMMARY***

The main function in this test overrides that of the GTest framework, which contains its `RUN_ALL_TESTS()` function. The main function in this test is called conditionally, in this case when the C10_MOBILE directive is provided. This is wrong, as we always want to call the `RUN_ALL_TESTS()` function.

In this PR, we only make the test suite available for the cases that apply, i.e., when the C10_MOBILE directive exists, which represents the caching allocator and is only exposed on mobile.

***TEST PLAN***

This test should run in the modes where it applies, which should be covered in the CI run.

Below is a sample run in dev-nosan mode, which does not have the caching allocator.

BEFORE
```
buck test fbcode//caffe2:cpu_caching_allocator_test
Discovered 0. Pass 0. Fail 0. Fatal 0. Skip 0. Timeout 0
⚠ Listing failed: caffe2:cpu_caching_allocator_test
Listing tests failed with error:
Failed to read from /data/users/ysuleiman/fbsource/buck-out/v2/test/buck-out/v2/test_discovery/fbcode/6dcc55a61c1b90b3/default/tpx_execution_dir/gtest_output_file.json. Listing process stdout: , stderr:
```

AFTER
```
buck test '@fbcode//mode/dev-nosan' fbcode//caffe2:cpu_caching_allocator_test
Analyzing targets. Remaining      0/46242                                                                                1871690 actions, 2251668 artifacts declared
Executing actions. Remaining      0/257870                                                                               83:28:24.4s exec time total
Command: test.     Finished 10 remote, 112314 cache (99% hit)                                                            83:22:43.5s exec time cached (99%)
Time elapsed: 2:57.7s
Tests finished: Pass 0. Fail 0. Fatal 0. Skip 0. Build failure 0
NO TESTS RAN
```

Rollback Plan:
steps:
  - manual.note:
      content: Revert this diff

Reviewed By: patskovn

Differential Revision: D77229077
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156816
Approved by: https://github.com/kimishpatel
2025-07-07 14:16:43 +00:00
54a4d34d10 [fbcode] switch to cutlass-4 (#157579)
Summary: Update the CUTLASS version to 4 for most use cases.

Test Plan:
testing in progress

Rollback Plan:

Differential Revision: D77605011

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157579
Approved by: https://github.com/drisspg, https://github.com/Skylion007
2025-07-07 14:12:33 +00:00
78684e27ac [xla hash update] update the pinned xla hash (#156584)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156584
Approved by: https://github.com/pytorchbot
2025-07-07 12:09:20 +00:00
40e39ae21f Update slow tests (#157696)
This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml).
Update the list of slow tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157696
Approved by: https://github.com/pytorchbot
2025-07-07 12:09:06 +00:00
e466dab164 [WIP] Automatically load and save dynamo entries via caching_precompile (#155913)
This PR adds a new config option, `caching_precompile`, and a `DynamoCache`, which loads and saves Dynamo Cache entries automatically. It also hooks up DynamoCache to PrecompileContext, so that we can save multiple cache entries.

When this configuration is turned on, we:
- Automatically create and initialize a CompilePackage on every torch.compile
- Automatically use BundledAutogradCache
- Automatically save the CompilePackage entry to DynamoCache after every compile

You can also use PrecompileContext.serialize() to manually serialize a full object.

I've added unit tests to exhibit this behavior.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155913
Approved by: https://github.com/zhxchen17
2025-07-07 11:56:30 +00:00
d27d36136c Don't try installing missing cuda dependencies on s390x (#157540)
Don't try installing missing cuda dependencies on s390x

Fixes #157409

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157540
Approved by: https://github.com/seemethere, https://github.com/huydhn
2025-07-07 09:16:38 +00:00
815545f2dd [inductor] enable bf32 for mkldnn linear pointwise/binary in inductor (#127294)
When `torch.backends.mkldnn.matmul.fp32_precision == 'bf16'`, we also enable mkldnn linear in the inductor path and allow it to run with the bf16 computation data type.
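A minimal sketch of the knob named here (the attribute path is taken from the commit text; whether the lowering actually happens depends on the CPU and mkldnn support):

```python
import torch

# Allow fp32 matmul/linear to run with bf16 computation internally.
torch.backends.mkldnn.matmul.fp32_precision = "bf16"

lin = torch.nn.Linear(64, 64)      # fp32 weights
compiled = torch.compile(lin)      # inductor may now pick a bf16 mkldnn kernel
out = compiled(torch.randn(8, 64))
```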

Testplan:
```
python test/inductor/test_mkldnn_pattern_matcher.py -k test_linear_unary
python test/inductor/test_mkldnn_pattern_matcher.py -k test_linear_fp32
python test/inductor/test_mkldnn_pattern_matcher.py -k test_multi_linear_share_same_input
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/127294
Approved by: https://github.com/jgong5, https://github.com/jansel

Co-authored-by: Jiang, Yanbing <yanbing.jiang@intel.com>
2025-07-07 06:03:41 +00:00
d26ca5de05 Support transpose and pack for bit8 (#156065)
To be used by CPU INT8 SDPA in torchao. https://github.com/pytorch/ao/pull/2380

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156065
Approved by: https://github.com/mingfeima, https://github.com/ezyang
2025-07-07 01:40:47 +00:00
Lei
2022588295 Fix: Ensure writeback handles NO_SHARD correctly by flattening tensors before copying (#154369)
Fixes #151223

Because FSDP stores original parameters as views into a flattened tensor, changing the flattened parameter’s tensor directly can desynchronize the views. With the NO_SHARD strategy this caused a shape mismatch error when writing back modified parameters.

This ensures writeback handles NO_SHARD correctly by flattening tensors before copying: the logic now flattens the source parameter or gradient when the strategy is unsharded, maintaining the expected 1-D shape for writeback operations.
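A hedged sketch of the idea (names are illustrative, not FSDP's actual internals):

```python
import torch

def writeback(flat_param: torch.Tensor, src: torch.Tensor,
              offset: int, sharded: bool) -> None:
    # flat_param is the 1-D flattened storage that the original parameters
    # view into. With NO_SHARD the source still has its original N-D shape,
    # so flatten it before copying to keep the views and storage in sync.
    if not sharded:
        src = src.flatten()
    flat_param[offset : offset + src.numel()].copy_(src)
```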

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154369
Approved by: https://github.com/weifengpy
2025-07-06 09:20:31 +00:00
02715d0876 [BE][5/6] fix typos in test/ (test/dynamo/) (#157639)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157639
Approved by: https://github.com/yewentao256, https://github.com/jansel
ghstack dependencies: #157638
2025-07-06 06:34:25 +00:00
17687eb792 [BE][4/6] fix typos in test/ (test/inductor/) (#157638)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157638
Approved by: https://github.com/yewentao256, https://github.com/jansel
2025-07-06 06:34:25 +00:00
7cda4017dd Fix torch.utils.cpp_extension parser for clang version 20.1.7+libcxx (#157666)
When the CC and CXX compilers are set to clang, and clang was compiled with libc++, compilation of torchvision fails with:

```
  File "/usr/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 585, in build_extensions
    compiler_name, compiler_version = self._check_abi()
                                      ^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 1034, in _check_abi
    _, version = get_compiler_abi_compatibility_and_version(compiler)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 449, in get_compiler_abi_compatibility_and_version
    if tuple(map(int, version)) >= minimum_required_version:
       ^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: invalid literal for int() with base 10: '7+libcxx'
```

Compiler identification is a valid semantic version:
```
$ clang -dumpfullversion -dumpversion
20.1.7+libcxx
```

After adjusting the version parser, clang is able to compile extensions successfully.
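A hedged sketch of a more tolerant parse (illustrative, not necessarily the exact patch):

```python
import re

raw = "20.1.7+libcxx"  # clang -dumpfullversion output when built with libc++
# Keep only the leading digits of each dotted component so that semver
# build metadata such as "+libcxx" no longer breaks int():
version = tuple(int(re.match(r"\d+", part).group()) for part in raw.split("."))
print(version)  # (20, 1, 7)
```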

Fixes #157665

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157666
Approved by: https://github.com/msaroufim
2025-07-06 01:35:00 +00:00
3e56a9cdfb More testing of Python arithmetic operators between tensors and scalars (see 157266) (#157632)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157632
Approved by: https://github.com/ezyang, https://github.com/Skylion007
2025-07-05 17:48:27 +00:00
ee9ac36c23 Fixing misspelling in documentation (#157565)
Fixes #157564

Fixes misspelling of the word parameter in documentation

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157565
Approved by: https://github.com/awgu, https://github.com/cyyever
2025-07-05 17:04:13 +00:00
9be5860bc3 [dynamo] Fix dynamic shapes handling in after_aot repro generation (#157136)
Summary:
- Extract symbolic variables directly from graph placeholders and arguments
- Add symbolic variable definitions to generated repro code
- Add unit tests with ToyModel for testing

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157136
Approved by: https://github.com/xmfan
ghstack dependencies: #157021
2025-07-05 15:38:41 +00:00
548c9d8281 Fix typo: 'paramter' → 'parameter' in quantization model report test (#157646)
This PR addresses a minor typo in the file `test/quantization/fx/test_model_report_fx.py`:

- Corrected the word "paramter" to "parameter" for better readability and accuracy.

While it's a small change, correcting such typographical errors contributes to maintaining the overall quality and professionalism of the codebase.

Thank you for your time and consideration in reviewing this PR. I'm happy to make any further adjustments if needed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157646
Approved by: https://github.com/yewentao256, https://github.com/ezyang
2025-07-05 12:28:36 +00:00
71a650ad56 Fix typo: 'Intializing' → 'Initializing' in test_parametrization.py (#157362)
This pull request fixes a minor typo in the doc comments of `test/nn/test_parametrization.py`.

- Replaced `'Intializing'` with `'Initializing'` in two docstring comments to improve clarity and maintain consistency across the codebase.

This is a non-functional change and does not impact behavior or test outcomes.

Thank you for maintaining such a high-quality codebase. Please let me know if any adjustments are needed. I'd be happy to help!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157362
Approved by: https://github.com/ezyang
2025-07-05 12:21:15 +00:00
2471cc3355 [pc] verify max autotune is in generated source code (#157650)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157650
Approved by: https://github.com/aorenste
ghstack dependencies: #157305, #157614, #157619
2025-07-05 07:55:11 +00:00
db00e1699a [pc] introduce ProgressiveCompilationState and clear callback (#157619)
Follow-up from https://github.com/pytorch/pytorch/pull/157305, where @aorenste
correctly suggested clearing the callback. This refactor introduces a new
dataclass so we don't need to check nullability for each field.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157619
Approved by: https://github.com/aorenste
ghstack dependencies: #157305, #157614
2025-07-05 07:55:11 +00:00
5ea832e5f6 [pc] migrate progression futures from list to deque (#157614)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157614
Approved by: https://github.com/aorenste
ghstack dependencies: #157305
2025-07-05 07:55:03 +00:00
a952956d05 Add isnan exit condition to special ops (#157464)
They might have been slow on CUDA 11.3, but that version of CUDA is long gone. The more fundamental underlying issue was the linear complexity of the recursive polynomial definitions for higher-order polynomials; for example, see this loop from the implementation of the Chebyshev polynomial of the first kind:
7081b8233a/aten/src/ATen/native/Math.h (L2969-L2973)
which were tested by `test_compare_cpu` using the following values (as sample index 16):
7081b8233a/torch/testing/_internal/opinfo/core.py (L2079)

Luckily, Chebyshev polynomials for absolute argument values greater than 1 reach infinity pretty quickly; see below:
```
python3 -c "import torch;print(torch.special.chebyshev_polynomial_v(torch.nextafter(torch.tensor(1.0), torch.tensor(2.0)), torch.tensor(1e6)))"
tensor(nan)
```
That is not the case for Laguerre polynomials, but it's probably fine to just limit it to 1e7.
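A hedged Python sketch of the recurrence with the early exit (the actual fix lives in the ATen C++ code linked above):

```python
import math

def chebyshev_t(x: float, n: int) -> float:
    # T_0 = 1, T_1 = x, T_k = 2*x*T_{k-1} - T_{k-2}
    if n == 0:
        return 1.0
    p, q = 1.0, x
    for _ in range(2, n + 1):
        p, q = q, 2.0 * x * q - p
        if math.isnan(q) or math.isinf(q):
            break  # value saturated: no point iterating up to a huge n
    return q
```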

Before
```
$ PYTORCH_TEST_WITH_SLOW=1 python test_ops.py -k chebyshev_polynomial_
ssssssss..ssssss..ssssss..ssssssssssssssssssssss..ssssss/home/ubuntu/py3.10-nightly/lib/python3.10/site-packages/torch/backends/cuda/__init__.py:131: UserWarning: This API is going to be deprecated, please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:78.)
  return torch._C._get_cublas_allow_tf32()
....ssssssssssss..ssssss..ssssss............ssssssssssssssssssssssssssssssssssss..ssssssssssssss..ssssss..ssssssssssssssssssssssssssssss..ssssss....ssssssssssss..ssssss..ssssss............ssssssssssssssssssssssssssssssssssss..ssssss..ssssssssssssss..ssssss..ssssss..ssssssssssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssssssssssss
----------------------------------------------------------------------
Ran 432 tests in 8.575s

OK (skipped=344)
```
After
```
$ PYTORCH_TEST_WITH_SLOW=1 python test_ops.py -k chebyshev_polynomial_
ssssssss........................ssssssssssssssss......../home/ubuntu/pytorch/torch/backends/cuda/__init__.py:131: UserWarning: This API is going to be deprecated, please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /home/ubuntu/pytorch/aten/src/ATen/Context.cpp:78.)
  return torch._C._get_cublas_allow_tf32()
........................................................................................xxxxxxxx................ssssssssssssssssssssssss........................................................................................................ssssssss........................ssssssss........................................................................................ssssssss
----------------------------------------------------------------------
Ran 432 tests in 45.580s

OK (skipped=72, expected failures=8)
```

Fixes https://github.com/pytorch/pytorch/issues/79528

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157464
Approved by: https://github.com/Skylion007, https://github.com/dcci
ghstack dependencies: #157488
2025-07-05 04:19:50 +00:00
63e87d6d05 [Refactor] Add maybe unused flag to remove warning (#157655)
Fixes #157653

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157655
Approved by: https://github.com/Skylion007, https://github.com/cyyever
2025-07-05 03:23:39 +00:00
f7127b9b94 [Refactor] Remove unused variables (#157654)
Fixes #157653

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157654
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-07-05 02:12:15 +00:00
44f5b93122 fix: correct sentence punctuation in cuDNN note (#157623)
Fixes #ISSUE_NUMBER
This PR fixes a small punctuation issue in the PyTorch README.

Specifically:

Added a missing full stop at the end of the sentence:
"Note: You could refer to the cuDNN Support Matrix for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware."

Added a comma for clarity between "CUDA driver" and "NVIDIA hardware".

These edits improve the readability and grammatical correctness of the documentation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157623
Approved by: https://github.com/Skylion007
2025-07-05 01:37:33 +00:00
e0fd48be7d Fix typo: 'occurances' → 'occurrences' in mobile model test (#157629)
This PR addresses a typo in the file `test/mobile/model_test/gen_test_model.py`.

### Changes:
- Corrected "occurances" to the correct spelling "occurrences"
- Renamed associated variables to reflect this change for consistency and clarity

This is a non-functional, cleanup-only PR to improve code readability.

Thanks to the PyTorch team for maintaining such a high-quality codebase

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157629
Approved by: https://github.com/Skylion007
2025-07-05 01:36:42 +00:00
43f7216327 Fix typo: 'paramters' → 'parameters' in ATen tunable README (#157575)
This PR addresses a minor typo in the documentation file aten/src/ATen/cuda/tunable/README.md, where paramters has been corrected to parameters for improved clarity and consistency.

Context
Accurate and clear documentation is crucial for helping developers and contributors understand PyTorch internals. This small fix contributes to the overall quality and readability of the project.

Thank you to the PyTorch team and maintainers for your continued efforts in building such an incredible framework. I'm happy to contribute in any way I can — even if just with a small doc improvement like this one.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157575
Approved by: https://github.com/eqy
2025-07-05 01:14:45 +00:00
8a8fac1131 [SymmMem] Move code to where it is used (#157611)
`maybe_initialize_env_vars` and `initialize_nvshmem_with_store` are only used in `NVSHMEMSymmetricMemory.cu`. Moving them there.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157611
Approved by: https://github.com/Skylion007
ghstack dependencies: #157513
2025-07-04 23:37:49 +00:00
bcc98bb2a4 Update _linux-test to support B200 runner (#157341)
This unblocks https://github.com/pytorch/test-infra/issues/6869.  The key changes to call out:

* B200 needs OIDC to access ECR and upload stats to S3, so we need to set `id-token: write` in `_linux-test`.  All workflows calling `_linux-test` also need to be updated accordingly
* Connecting sccache to S3 on B200 doesn't seem to work, so I disable it.  It still works locally though.

### Testing

https://github.com/pytorch/pytorch/actions/runs/16055549292/job/45312298376
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157341
Approved by: https://github.com/nWEIdia, https://github.com/atalman, https://github.com/malfet
2025-07-04 23:19:24 +00:00
524e827095 [build] modernize build-backend: setuptools.build_meta:__legacy__ -> setuptools.build_meta (#155998)
Change `build-system.build-backend`: `setuptools.build_meta:__legacy__` -> `setuptools.build_meta`. Also, move static package info from `setup.py` to `pyproject.toml`.

Now the repo can be installed from source via the `pip` command instead of `python setup.py develop`:

```bash
python -m pip install --verbose --editable .

python -m pip install --verbose --no-build-isolation --editable .
```

In addition, the SDist is also buildable:

```bash
python -m build --sdist
python -m pip install dist/torch-*.tar.gz  # build from source using SDist
```

Note that we should build the SDist from a fresh git clone if we plan to upload the output to PyPI, because all files under `third_party` will be included in the SDist; the file will be huge if the git submodules are initialized.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155998
Approved by: https://github.com/ezyang, https://github.com/cyyever, https://github.com/atalman
ghstack dependencies: #157557
2025-07-04 19:25:14 +00:00
9968edd002 Fix #153942 (#153943)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153943
Approved by: https://github.com/malfet
2025-07-04 18:25:18 +00:00
7275f28045 Fix cuda 12.9 aarch64 GPU builds. Update CUDA_STABLE variable. (#157630)
This contains 2 fixes that are required in main and will need to be cherry-picked to the Release 2.8 branch:
1. The PR https://github.com/pytorch/pytorch/pull/155819 missed including the triton change.
2. The CUDA_STABLE variable needs to be set to 12.8; updating CUDA stable updates the full static build.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157630
Approved by: https://github.com/Skylion007, https://github.com/jeanschmidt
2025-07-04 18:08:31 +00:00
7be862ab8f [dynamo] Relax DUPLICATED_INPUT to be serializable. (#157492)
Since we don't actually rely on any real data while building the DUPLICATE_INPUT guard, we can safely serialize it with sources, and it should be possible to reconstruct the guard correctly in the new process. Therefore we don't really need to prevent serializing it.

Differential Revision: [D77683302](https://our.internmc.facebook.com/intern/diff/D77683302/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157492
Approved by: https://github.com/jamesjwu, https://github.com/jansel
2025-07-04 15:19:34 +00:00
336f1e2d35 [AOTI] Fix AOT inductor CMake build dependency order (#157557)
compile_model.py -> aoti_custom_class -> torch

The custom command requires `torch` to be installed.

8408522976/test/cpp/aoti_inference/compile_model.py (L1-L7)

Fixes CI failure on trunk:

- https://github.com/pytorch/pytorch/actions/runs/16041370426/job/45275085572#step:22:18348

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157557
Approved by: https://github.com/Skylion007, https://github.com/cyyever
2025-07-04 14:33:36 +00:00
a46ea8a364 Fix typo: 'initalized' → 'initialized' in alias analysis test (#157628)
This PR corrects a small spelling error in `test/jit/test_alias_analysis.py`.

- "initalized" → "initialized"

This is a minor comment correction and does not affect functionality or logic.

Thank you for maintaining this amazing codebase.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157628
Approved by: https://github.com/Skylion007
2025-07-04 13:41:53 +00:00
f41d017aa6 Add device check in mse_loss (#155089)
Fixes #154978

## Test Result

```python
>>> import torch
>>> import numpy as np
>>> import torch.nn as nn
>>> import torch.distributions.normal as norm
>>> device = torch.device(('cuda' if torch.cuda.is_available() else 'cpu'))
>>> print('Using {}'.format(device))
Using cuda
>>> m = nn.Sequential(nn.Linear(1, 128).cuda(), nn.Tanh(), nn.Linear(128, 128).cuda(), nn.Tanh(), nn.Linear(128, 128).cuda(), nn.Tanh())
>>> m.to(device, dtype=None, non_blocking=False)
Sequential(
  (0): Linear(in_features=1, out_features=128, bias=True)
  (1): Tanh()
  (2): Linear(in_features=128, out_features=128, bias=True)
  (3): Tanh()
  (4): Linear(in_features=128, out_features=128, bias=True)
  (5): Tanh()
)
>>> opt = torch.optim.Adam(m.parameters(), lr=0.001)
>>> print('Number of trainable parameters: ', sum((p.numel() for p in m.parameters() if p.requires_grad)))
Number of trainable parameters:  33280
>>> input_tensor = torch.tensor(77.0, device=device)
>>> target = torch.tensor(66.0)
>>> loss_function = nn.MSELoss()
>>> print('Loss Function: ', loss_function)
Loss Function:  MSELoss()
>>> loss = loss_function(input_tensor, target)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/zong/code/pytorch/torch/nn/modules/module.py", line 1767, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zong/code/pytorch/torch/nn/modules/module.py", line 1778, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zong/code/pytorch/torch/nn/modules/loss.py", line 610, in forward
    return F.mse_loss(input, target, reduction=self.reduction)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zong/code/pytorch/torch/nn/functional.py", line 3903, in mse_loss
    return torch._C._nn.mse_loss(
           ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155089
Approved by: https://github.com/cyyever, https://github.com/albanD
2025-07-04 12:37:48 +00:00
52e4e41cbc [dynamo] do not issue lru_cache warning for functions in the top-level torch namespace (#157598)
`lru_cache` usage warning was being raised for `torch.get_device_module()`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157598
Approved by: https://github.com/Sidharth123-cpu
2025-07-04 08:17:50 +00:00
64f2ec77f8 [inductor] Fix fractional_max_pool2d 3D input causing assertion error (#156912)
Fixes #156682

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156912
Approved by: https://github.com/angelayi
2025-07-04 06:09:28 +00:00
fdc5b42a8f _broadcast_shapes gso generalizations (#157008)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157008
Approved by: https://github.com/ColinPeppler
ghstack dependencies: #155590
2025-07-04 05:56:42 +00:00
d58ed04d89 [async-compile] add progressive compile mode (#157305)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157305
Approved by: https://github.com/aorenste
2025-07-04 04:18:50 +00:00
386bc9e2e9 [audio hash update] update the pinned audio hash (#156905)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156905
Approved by: https://github.com/pytorchbot
2025-07-04 04:06:59 +00:00
f2e712ca14 Revert "Fix is_unaligned usage of statically_known_true (#157400)"
This reverts commit b359571c6043b40c4ae4fbb07135fd0f04902e21.

Reverted https://github.com/pytorch/pytorch/pull/157400 on behalf of https://github.com/malfet due to It break tests, see 99c1a6bdd9/1 ([comment](https://github.com/pytorch/pytorch/pull/157400#issuecomment-3034353539))
2025-07-04 03:57:08 +00:00
99c1a6bdd9 [SymmMem] Find NVSHMEM from system installation (#157513)
Previously we only searched for NVSHMEM in the pip install location.
This PR adds a search of the system locations that CMake treats as defaults.
Related: #157453 untars NVSHMEM into `/usr/local` on our CI machines.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157513
Approved by: https://github.com/atalman, https://github.com/Skylion007
2025-07-04 03:34:44 +00:00
4ed1b03f72 Add missing graph and memory related symbols to cuda_to_hip_mappings (#157435) (#157573)
Summary: This PR adds missing CUDA symbols in `cuda_to_hip_mappings`.

Test Plan: Tested in D77642700.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157573
Approved by: https://github.com/Skylion007

Co-authored-by: Geon-Woo Kim <gwkim@meta.com>
2025-07-04 03:03:04 +00:00
8f9a191db6 [SymmMem] Fix CI name mismatch; remove TORCH_SYMMMEM requirement (#157597)
Thanks @huydhn for spotting two name mismatches in the CI configs.
We were matching against "test_h100_symm_mem" instead of "h100-symm-mem".

Also, replaced `TORCH_SYMMMEM` env setting with programmatic method:
`symm_mem.set_backend(...)`

Further, skips a hanging test in `test_nvshmem_trion.py`. (#TODO @codingwithsurya )

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157597
Approved by: https://github.com/fduwjj, https://github.com/huydhn
2025-07-04 01:43:08 +00:00
ef97bd4713 [torch] Add MTIA to the list of devices supporting foreach/fused kernels (#157583)
Summary: We currently have foreach kernel implementations for MTIA, and where we don't, we decompose the ops internally. Anyone using this list for compatibility checks should be sending through the foreach kernels.

Reviewed By: egienvalue, scottxu0730

Differential Revision: D77751248

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157583
Approved by: https://github.com/egienvalue
2025-07-04 01:15:24 +00:00
f0b388665e Add dynamo_timed to bytecode hook (#157587)
Test Plan:
- ran tlparse on vLLM and saw this

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157587
Approved by: https://github.com/jingsh, https://github.com/BoyuanFeng
2025-07-04 01:11:03 +00:00
c9a5bf09ba [FP8] FP8 for SwishLayerNorm (#157574)
Summary: Add a pass `use_triton_fp8_swish_replace_normal_swish` that replaces `_triton_swish_rms_norm` with its FP8-capable counterpart `triton_swish_rms_norm`, and turn on FP8 during inference.

Test Plan:
```
buck2 run mode/opt  mode/inplace -c fbcode.platform010_cuda_version=12.4 -c fbcode.nvcc_arch=h100 caffe2/torch/fb/model_transform/experimental/benchmark:mts_gpu_benchmark -- --lower-backend=AOT_INDUCTOR   --model-snapshot-id=899072727_0 --node-replacement-dict="{}" --gpu-trace --add-passes=use_triton_fp8_swish_replace_normal_swish
```
The perf improvement on the 100x model with this pass is roughly ~7%, details are recorded [here](https://docs.google.com/document/d/1eIV_OTQyQcf_DlEDxwycTwhyGxT5OJkLzs8cPL6EMYc/edit?tab=t.0)

Rollback Plan:

Reviewed By: frank-wei

Differential Revision: D76531303

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157574
Approved by: https://github.com/frank-wei
2025-07-04 01:06:21 +00:00
dfcda613b6 Ensure Dynamo can trace through explicit dunder method call (#154366)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154366
Approved by: https://github.com/zou3519
ghstack dependencies: #153150, #152991, #154539, #153553, #154063, #154064, #154065, #154066, #154263
2025-07-04 00:46:05 +00:00
0e7f02fe2e [Dynamo] [FrozensetSubclass] Add support for user defined frozensets (#154263)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154263
Approved by: https://github.com/williamwen42
ghstack dependencies: #153150, #152991, #154539, #153553, #154063, #154064, #154065, #154066
2025-07-04 00:46:05 +00:00
308b88bde9 [Dynamo] [Set] Add comparison for set subclass (#154066)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154066
Approved by: https://github.com/Skylion007
ghstack dependencies: #153150, #152991, #154539, #153553, #154063, #154064, #154065
2025-07-04 00:45:58 +00:00
c51da57b55 [Dynamo] [Set] Raise TypeError in set.union(...) and "__or__" (#154065)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154065
Approved by: https://github.com/williamwen42
ghstack dependencies: #153150, #152991, #154539, #153553, #154063, #154064
2025-07-04 00:45:50 +00:00
f9544f1f0c [Dynamo] [Set] Raise TypeError if object is unhashable (#154064)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154064
Approved by: https://github.com/Skylion007
ghstack dependencies: #153150, #152991, #154539, #153553, #154063
2025-07-04 00:45:42 +00:00
11c71053e0 [Dynamo] [Set] Implement some binop operators for dict/set/frozenset/dict_keys (#154063)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154063
Approved by: https://github.com/williamwen42, https://github.com/zou3519
ghstack dependencies: #153150, #152991, #154539, #153553
2025-07-04 00:45:34 +00:00
22abe6ded4 [Dynamo] [SetSubclass] Add support for user defined sets (#153553)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153553
Approved by: https://github.com/williamwen42, https://github.com/zou3519
ghstack dependencies: #153150, #152991, #154539
2025-07-04 00:45:25 +00:00
2b82c61f04 [Generator] Implement generator.__contains__ (#154539)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154539
Approved by: https://github.com/williamwen42, https://github.com/zou3519
ghstack dependencies: #153150, #152991
2025-07-04 00:45:18 +00:00
f651e28f80 [FrozenSet] Fixes for FrozenSet (#152991)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/152991
Approved by: https://github.com/zou3519
ghstack dependencies: #153150
2025-07-04 00:45:11 +00:00
e7167dbacf [Set] Support sets in VariableBuilder (#153150)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153150
Approved by: https://github.com/zou3519
2025-07-04 00:45:03 +00:00
6c42afe196 Introduce sync_cross_rank_decision (#156287)
Summary:
This is an improvement over `_broadcast_rank0_decision`, where we use rank0's decision and broadcast it to every rank. The issue with `_broadcast_rank0_decision` is that we observed large variance in peak memory usage. One cause is that different ranks receive different dynamic-shaped tensors, and the hints for those tensors differ across ranks. If we rely only on rank0's decision and it happens to get unrepresentative hints, the decision it makes may not suit the other ranks.

Here, we introduce `sync_cross_rank_decision`, which arrives at a decision after comparing all ranks' local decisions. It will:
1. all-gather the decisions from all ranks;
2. test each decision on the current rank and get its estimated memory usage;
3. all-reduce the estimated memory usages with ReduceOp.MAX, so that we know the maximum memory usage of each decision across all ranks;
4. pick the decision that yields the minimum of those maximum memory usages.

A graph with more details:
https://internalfb.com/excalidraw/EX484509

After applying sync_cross_rank_decision, we observed that the variance is much smaller.
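
To make the flow concrete, here is a minimal sketch of the minimax selection described above, assuming a hypothetical `estimate_peak_memory` callback, picklable decision objects, and a backend that accepts CPU tensors (e.g. gloo); the real pass in Inductor differs in detail:

```python
import torch
import torch.distributed as dist

def sync_cross_rank_decision(local_decision, estimate_peak_memory):
    # 1. All-gather every rank's locally computed decision.
    world_size = dist.get_world_size()
    decisions = [None] * world_size
    dist.all_gather_object(decisions, local_decision)

    # 2. Estimate the peak memory each candidate would use on *this* rank.
    costs = torch.tensor([estimate_peak_memory(d) for d in decisions])

    # 3. MAX-reduce so entry i becomes decision i's worst-case memory
    #    across all ranks.
    dist.all_reduce(costs, op=dist.ReduceOp.MAX)

    # 4. Every rank computes the same argmin, so all ranks agree on the
    #    decision whose worst-case peak memory is smallest.
    return decisions[int(torch.argmin(costs))]
```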

Rollback Plan:

Differential Revision: D76714005

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156287
Approved by: https://github.com/fmassa, https://github.com/bdhirsh
2025-07-03 23:43:53 +00:00
f7130c097e [nativert] Move Executor to PyTorch core (#157514)
Test Plan:
CI

Rollback Plan:

Differential Revision: D77693984

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157514
Approved by: https://github.com/zhxchen17
2025-07-03 23:31:54 +00:00
ad86c05b78 efficient zero_mask implementation for vec128_*_neon (#155766)
Differential Revision: [D76481039](https://our.internmc.facebook.com/intern/diff/D76481039/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155766
Approved by: https://github.com/malfet
2025-07-03 23:27:03 +00:00
b359571c60 Fix is_unaligned usage of statically_known_true (#157400)
Summary:
- The symbolic-shapes `statically_known_true` usage is wrong; that API is meant to be used with SymNodes. What is needed is `V.graph.sizevars.statically_known_true`, or `V.graph.sizevars.statically_known_equals`, or ideally `V.graph.sizevars.statically_known_multiple_of`.

- The construction using `== 0` is not symbolic; it used to always return false for symbolic inputs.

Differential Revision: D77619293

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157400
Approved by: https://github.com/ColinPeppler
2025-07-03 23:26:36 +00:00
a6fab82b16 [BE]: Fix NVSHMEM builds, add missing 12.9 dependency and update to latest for 2.8RC (#157453)
Fixes our bad builds of NVSHMEM (we were not building or testing it before) and also updates to the latest version. The newest version has critical support for things that actually make it useful, such as bfloat16 and float16.

This is a proper fix for: https://github.com/pytorch/pytorch/pull/157411
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157453
Approved by: https://github.com/kwen2501, https://github.com/atalman
2025-07-03 22:55:18 +00:00
dd3e7170c2 Add async checkpointing impl to experimental checkpointer and add a builder API (#156927)
1. Adds an AsyncCheckpointer with out-of-process checkpointing and a state_dict_stager with shared memory, pinned memory, and Zero Overhead support.

2. Adds two convenient functions to create sync/async checkpointers

Differential Revision: [D77336833](https://our.internmc.facebook.com/intern/diff/D77336833/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156927
Approved by: https://github.com/pradeepfn
2025-07-03 22:49:20 +00:00
7081b8233a [BE] Accelerator agnostic timer.py (#157131)
Farewell to a lot of if statements; a side benefit is that this now also supports MPS synchronization.

Still need to think of a good test strategy for the privateUse1 removal. Granted, I'm not sure what the semantics of something like https://docs.pytorch.org/docs/stable/generated/torch.cpu.synchronize.html actually are, since CPU is probably synchronous?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157131
Approved by: https://github.com/albanD
2025-07-03 22:23:04 +00:00
7b392bac13 all_gather_bucketing fx pass (#157396)
Porting passes to bucket all_gathers.

The main logic of the pass is done via
1. Searching for all all_gathers from the buckets

Copying tests from @wconstab's PR to test compatibility with reordering.
The test checks only compatibility because, due to (3), the joint all_gather will already be scheduled as early as possible, leaving no room for reordering.

Pass changes:
Using mutation ops to match the performance of FSDP; the ideal future scenario is a purely functional graph, with Inductor performing all memory optimizations on its own without mutable ops.

Inductor changes:
Adding a foreach_copy_ lowering (see the sketch below).
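
For intuition, a conceptual eager-mode sketch of the bucketing idea, assuming same-dtype flattenable params (the actual pass rewrites fx graph nodes and uses mutation ops as described above):

```python
import torch
import torch.distributed as dist

def bucketed_all_gather(params: list[torch.Tensor]) -> list[torch.Tensor]:
    # Fuse N small all_gathers into one: flatten and concatenate the
    # inputs, gather once, then slice each rank's slab back apart.
    world_size = dist.get_world_size()
    sizes = [p.numel() for p in params]
    flat = torch.cat([p.reshape(-1) for p in params])
    out = flat.new_empty(world_size * flat.numel())
    dist.all_gather_into_tensor(out, flat)

    # `out` is rank-major: [rank0's flat | rank1's flat | ...]; regroup
    # per parameter, concatenating each param's shards across ranks.
    per_rank = [slab.split(sizes) for slab in out.chunk(world_size)]
    return [torch.cat([pieces[i] for pieces in per_rank])
            for i in range(len(params))]
```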

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157396
Approved by: https://github.com/wconstab
2025-07-03 22:07:42 +00:00
19ae5afdaa Fix typo: 'recieve' → 'receive' in comments (#157544)
This PR corrects minor typos in developer-facing comments:

- Replaces 'recieve' with 'receive' in:
  - `FunctionalTensorWrapper.cpp`
  - `make_boxed_from_unboxed_functor.h`

These changes improve code readability and maintain comment correctness.

Thank you for reviewing!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157544
Approved by: https://github.com/soulitzer
2025-07-03 19:11:15 +00:00
3fd84a8592 [BE][PYFMT] migrate PYFMT for torch/[a-c]*/ to ruff format (#144554)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144554
Approved by: https://github.com/soulitzer
2025-07-03 18:56:07 +00:00
d56f11a1f2 [MPS] Implement logcumsumexp metal kernel (#156858)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156858
Approved by: https://github.com/malfet
ghstack dependencies: #157512
2025-07-03 18:16:25 +00:00
794b95d54b Enable Half dtype for logcumsumexp_backward (#157512)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157512
Approved by: https://github.com/malfet
2025-07-03 18:13:38 +00:00
e3fe001d9e Add einops x torch.compile testing in PyTorch CI (#157416)
Fixes #146782. This PR adds testing for multiple einops versions in
PyTorch CI. This occurs in a new "einops" CI job that runs for both
Python 3.9 and 3.13 (aka, what we test Dynamo over).

Test Plan:
- wait for CI

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157416
Approved by: https://github.com/guilhermeleobas, https://github.com/arogozhnikov, https://github.com/anijain2305
2025-07-03 17:36:39 +00:00
660dbea909 [cutlass backend] modify presets ahead of cutlass 4 upgrade (#157522)
Differential Revision: [D77707409](https://our.internmc.facebook.com/intern/diff/D77707409/)

Also asking in https://github.com/NVIDIA/cutlass/issues/2435

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157522
Approved by: https://github.com/coconutruben
2025-07-03 17:13:24 +00:00
5cfe4377d6 [dtensor] Rework partial propagation in pointwise op and support mul (#157340)
I am trying to see if I can easily add linearity support for aten.mul to allow the Partial placement to propagate through, but it turns out that I have to completely rework the current linearity propagation.

In short, before this PR linearity mainly supported aten.add and some trivial ops. It is done by allowing Partial inputs to propagate and, in the meantime, redistributing Replicate inputs to Partial to preserve single-device semantics. I.e., suppose we want to execute `aten.add(lhs, rhs)` on 2 ranks:
* `lhs` is partial, value on rank 0: `r0`, value on rank 1: `r1`
* `rhs` is replicate, value: `a`

Then, in order to preserve single-device semantics (which should produce the value `a + r0 + r1`), we do `rhs/world_size` first, then add `rhs` to `lhs`. This means every operand would first need to be partial before we can add them together.

But this no longer holds for multiplicative operations like `aten.mul`: assuming the same `aten.mul(lhs, rhs)` and values, we don't need the division by world_size to preserve single-device semantics, because `a*(r0+r1) = a*r0 + a*r1`.

So to accommodate the difference between add and mul, in this PR I:
* change linearity to be an int to support different linearity types; additive and multiplicative linearity are separate
* add checks to ensure only a subset of partial types can support linearity (namely partial-sum/avg)
* handle the linearity-type plumbing through the pointwise ops
* add `mul.Tensor/Scalar` as the multiplicative linearity
* add tests showing that Partial placements can be propagated with `aten.mul` (see the sketch below)
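
A plain-arithmetic sketch of why the two linearity types need different handling, using the values from the example above (world size 2):

```python
# Partial lhs: rank 0 holds r0, rank 1 holds r1; the full value is r0 + r1.
# Replicate rhs: both ranks hold a.
r0, r1, a, world_size = 3.0, 5.0, 2.0, 2

# Additive linearity (aten.add): the replicated operand must first become
# Partial via division by world_size, or `a` would be double-counted when
# the partial shards are eventually summed:
assert (r0 + a / world_size) + (r1 + a / world_size) == (r0 + r1) + a

# Multiplicative linearity (aten.mul): no division is needed, since
# multiplication distributes over the partial sum:
assert a * r0 + a * r1 == a * (r0 + r1)
```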

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157340
Approved by: https://github.com/zpcore
2025-07-03 17:04:08 +00:00
898179331e [cutlass backend] fix CutlassTensor post-renaming (#157408)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157408
Approved by: https://github.com/mlazos
ghstack dependencies: #157402
2025-07-03 17:02:21 +00:00
2e64e45b0b Revert "[build] modernize build-backend: setuptools.build_meta:__legacy__ -> setuptools.build_meta (#155998)"
This reverts commit 404008e3efdabeaf5b140a3aff77131461c33a0a.

Reverted https://github.com/pytorch/pytorch/pull/155998 on behalf of https://github.com/malfet due to Broke inductor_cpp, wrapper see e472daa809/1 ([comment](https://github.com/pytorch/pytorch/pull/155998#issuecomment-3032915058))
2025-07-03 16:47:07 +00:00
e472daa809 [dynamo] Add fx_graph_runnable test coverage (#157021)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157021
Approved by: https://github.com/StrongerXi, https://github.com/xmfan

Co-authored-by: Simon Fan <xmfan@meta.com>
2025-07-03 16:42:06 +00:00
ec816d73b4 [MPS] Add shifted_chebyshev_polynomial_[tuvw] (#157488)
For eager and inductor

As for all other chebyshev ops, logic is simply compiled from 94716db222/aten/src/ATen/native/cuda/Math.cuh (L2821)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157488
Approved by: https://github.com/dcci
2025-07-03 15:48:37 +00:00
f17f658125 [profiler] add more CUDA API for kernel launcher (#156016)
Add more kernel detection options, resolving TODO
- References : [NVIDIA - docs](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156016
Approved by: https://github.com/albanD

Co-authored-by: albanD <desmaison.alban@gmail.com>
2025-07-03 15:26:42 +00:00
c9174a20f7 Revert "[BE] Unskip special ops (#157464)"
This reverts commit e124a0d88ca2aa04bfaca2dcabf5de6244048e45.

Reverted https://github.com/pytorch/pytorch/pull/157464 on behalf of https://github.com/clee2000 due to caused slow test config to time out [GH job link](https://github.com/pytorch/pytorch/actions/runs/16037776972/job/45254574100) [HUD commit link](e124a0d88c) ([comment](https://github.com/pytorch/pytorch/pull/157464#issuecomment-3032676989))
2025-07-03 15:24:15 +00:00
b6276a425f Revert "[MPS] Add shifted_chebyshev_polynomial_[tuvw] (#157488)"
This reverts commit 9620994067b18e846a097d1e99af85ec2426ef0a.

Reverted https://github.com/pytorch/pytorch/pull/157488 on behalf of https://github.com/clee2000 due to caused slow test config to time out [GH job link](https://github.com/pytorch/pytorch/actions/runs/16037776972/job/45254574100) [HUD commit link](e124a0d88c) ([comment](https://github.com/pytorch/pytorch/pull/157464#issuecomment-3032676989))
2025-07-03 15:24:15 +00:00
a0e0abd037 Fix typo: 'intialized' → 'initialized' in test_modules.py (#157226)
This PR fixes a minor typo in `test/jit/test_modules.py`:

- Before: `intialized`
- After:  `initialized`

There are no functional code changes — this is a comment-only fix to improve clarity and consistency.

Thank you to the PyTorch team for maintaining this outstanding project.
Please let me know if anything else is needed.

With appreciation,
Abhishek Nandy
[@abhitorch81](https://github.com/abhitorch81)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157226
Approved by: https://github.com/Skylion007
2025-07-03 14:56:02 +00:00
b221be9140 Fix typo: 'intial_query_grad' → 'initial_query_grad' in test_transformers.py (#157306)
This is a minor typo fix in `test/test_transformers.py`:

- Renamed `intial_query_grad` to `initial_query_grad` for improved clarity and correctness in test variable naming.

There are **no functional or logic changes** — this PR is aimed purely at improving readability and maintaining code quality.

Thanks to the PyTorch team for their work and review time
Please feel free to suggest if this needs any adjustment.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157306
Approved by: https://github.com/Skylion007
2025-07-03 14:08:12 +00:00
8408522976 Remove +PTX from CUDA 12.8 builds (#157516)
Remove +PTX from CUDA 12.8 builds, plus a small refactor in build_cuda.sh.
Removing +PTX reduces the binary size, which is required to be able to upload the binaries to PyPI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157516
Approved by: https://github.com/malfet, https://github.com/ptrblck, https://github.com/tinglvv
2025-07-03 13:19:19 +00:00
c329a8f19c Fix CPU bitwise shifts for out-of-limit values in VSX-vec (#157463)
Similar to #96659 this implements the conditionals handling the out-of-limit values in the shift amounts (rhs) for the vectorized VSX code using the same logic as the scalar code.

Fixes #109777

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157463
Approved by: https://github.com/jgong5
2025-07-03 10:41:33 +00:00
5dfd8a9c7a Remove is_jit_trace option (#157387)
Summary: Title

Test Plan:
CI

Rollback Plan:

Differential Revision: D77319249

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157387
Approved by: https://github.com/pianpwk
2025-07-03 09:20:27 +00:00
8c2e450082 [PT][FSDP] fail set_allocate_memory_from_process_group if used together with custom comm hooks (#157487)
Summary:
This is a follow-up to the PR that added comm override support: https://github.com/pytorch/pytorch/pull/155189

The previous PR loosely checks the allocation mixin classes, which isn't really safe, as the actual hook may still override the behavior.
This may lead to unnecessary confusion for no good use case. So for now we just make the 2 sets of APIs largely incompatible:
1. setting custom comms after `set_allocate_memory_from_process_group_for_comm()` is OK.
2. setting `set_allocate_memory_from_process_group_for_comm()` after custom comms is not OK.

Basically `set_allocate_memory_from_process_group_for_comm` is like a drop-in hammer, while `set_custom_all_gather/reduce_scatter()` are like finer-grained scalpels that require more carefully crafted code.

We can revisit this if a use case arises in between, but for now the two can largely be viewed as independent from each other (even though they share some of the underlying pieces for now; that could be subject to change and should not be exposed to end users).

Test Plan: added UT

Differential Revision: D77681620

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157487
Approved by: https://github.com/weifengpy
2025-07-03 07:00:35 +00:00
2bb33e7a08 Fixed triton kernel in ET due to Triton version change. (#157484)
Summary: Fixed triton kernel in ET due to Triton version change.

Test Plan:
buck2 run mode/opt param_bench/fb/integration_tests:test_et_replay

Rollback Plan:

Differential Revision: D77398841

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157484
Approved by: https://github.com/davidberard98
2025-07-03 06:16:23 +00:00
4ce6e6ec88 XCCL changes for DDP (#155497)
Add XCCL documentation for DDP

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155497
Approved by: https://github.com/guangyey, https://github.com/AlannaBurke

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
2025-07-03 05:18:08 +00:00
382598ef87 Fix unsafe collective reorder past wait (#157489)
Covers the case where the output of one collective feeds the input of another collective.
e.g. TP + FSDP - all_gather(tp+dp sharded param on TP dim) -> allgather dp_sharded buffer on DP dim

Fixes a bug where the reordering pass specifically exempted wait nodes from dependencies.
Note:  this exemption was incorrect, so it should be removed. But it was also put there for a reason, to help move collectives past wait nodes that are not related to that collective.  After this fix, reordering performance may be worse and we need to find a smarter way to decide if a particular wait node is a blocker for a given collective.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157489
Approved by: https://github.com/IvanKobzarev
ghstack dependencies: #156879
2025-07-03 05:04:19 +00:00
dc524efb4d Move logging into inner method for reorder pass (#156879)
The reason for the inner/outer split is to keep the outer method conforming
to the typedef for a comms graph pass, which returns one object, while
allowing unit tests to call the inner method that returns more metadata
useful for testing the pass. The logs should be in the inner part so
that they are also active during unit testing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156879
Approved by: https://github.com/IvanKobzarev
2025-07-03 05:04:19 +00:00
5d5a5b3501 Fix GITHUB_OUTPUT syntax in create_release.yml workflow (#157466)
#149919 fixed a number of linting issues, however, the conversion of the deprecated `::set-output` command to the new `>> $GITHUB_OUTPUT` redirect syntax went wrong, resulting in [failing uploads of the 2.8.0 rc1-rc3 pre-release tarballs](https://github.com/pytorch/pytorch/actions/runs/15892205745/job/44816789782).

This PR fixes that.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157466
Approved by: https://github.com/clee2000, https://github.com/atalman
2025-07-03 04:57:52 +00:00
404008e3ef [build] modernize build-backend: setuptools.build_meta:__legacy__ -> setuptools.build_meta (#155998)
Change `build-system.build-backend`: `setuptools.build_meta:__legacy__` -> `setuptools.build_meta`. Also, move static package info from `setup.py` to `pyproject.toml`.

Now the repo can be installed from source via `pip` command instead of `python setup.py develop`:

```bash
python -m pip install --verbose --editable .

python -m pip install --verbose --no-build-isolation --editable .
```

In addition, the SDist is also buildable:

```bash
python -m build --sdist
python -m pip install dist/torch-*.tar.gz  # build from source using SDist
```

Note that we should build the SDist from a fresh git clone if we plan to upload the output to PyPI, because all files under `third_party` are included in the SDist. The SDist file will be huge if the git submodules have been initialized.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155998
Approved by: https://github.com/ezyang, https://github.com/cyyever, https://github.com/atalman
2025-07-03 04:10:44 +00:00
b642a5c118 [cutlass backend] Add dynamo timed (#157410)
Differential Revision: [D77631592](https://our.internmc.facebook.com/intern/diff/D77631592/)

Before:
![Screenshot 2025-07-01 at 4 08 06 PM](https://github.com/user-attachments/assets/8f6445aa-50c7-456f-b5ac-b2749eb9bf40)

After (different run):
![Screenshot 2025-07-01 at 5 11 09 PM](https://github.com/user-attachments/assets/7513d312-c4dc-4e39-9718-c63eb641bc30)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157410
Approved by: https://github.com/jingsh
2025-07-03 04:03:20 +00:00
493f42a541 [symm_mem] Create a one side get api for symm mem (#157294)
Doing something similar to what we did in https://github.com/pytorch/pytorch/pull/156443 so that we can also have a one-sided get API for symmetric memory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157294
Approved by: https://github.com/kwen2501
2025-07-03 03:52:05 +00:00
662c1cfed2 [c10d][PGNCCL] Add waitcounter for watchdog and heartbeat monitoring thread (#157480)
We want to have a wait counter for both side threads so that we can monitor their lifecycle.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157480
Approved by: https://github.com/d4l3k
2025-07-03 02:47:06 +00:00
5cc4e856fd Add device_id to XPU device properties (#156481)
# Motivation

Some older Intel iGPUs may share the same device name across different hardware products.
(See [device name example](aaa01c06f9/shared/source/dll/devices/devices_base.inl (L190-L199)))
To help disambiguate which specific iGPU product is being used, we introduce the use of a
[device id](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_intel_device_info.md#device-id). This device id corresponds to the Device ID in [official Intel product specification](https://www.intel.com/content/www/us/en/products/sku/232155/intel-core-i71360p-processor-18m-cache-up-to-5-00-ghz/specifications.html) and enables more accurate identification and troubleshooting for user issues.
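
A small usage sketch of the new field (assuming an XPU build; the attribute name follows this PR's title):

```python
import torch

if torch.xpu.is_available():
    props = torch.xpu.get_device_properties(0)
    # device_id matches the Device ID on Intel's product-spec pages,
    # disambiguating iGPUs that share the same marketing name.
    print(f"{props.name}: device_id=0x{props.device_id:x}")
```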

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156481
Approved by: https://github.com/EikanWang, https://github.com/albanD
2025-07-03 01:22:11 +00:00
7597988f1b [fake tensor] fix issue of no attribute tags (#156689)
Fixes #156688

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156689
Approved by: https://github.com/leslie-fang-intel, https://github.com/atalman
2025-07-03 01:16:01 +00:00
9620994067 [MPS] Add shifted_chebyshev_polynomial_[tuvw] (#157488)
For eager and inductor

As for all other chebyshev ops, logic is simply compiled from 94716db222/aten/src/ATen/native/cuda/Math.cuh (L2821)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157488
Approved by: https://github.com/dcci
ghstack dependencies: #157464
2025-07-02 23:29:35 +00:00
e124a0d88c [BE] Unskip special ops (#157464)
They were slow on CUDA 11.3, which is long gone; let's see if they work now.

Before
```
$ python test_ops.py -k chebyshev_polynomial_
ssssssss..ssssss..ssssss..ssssssssssssssssssssss..ssssss/home/ubuntu/py3.10-nightly/lib/python3.10/site-packages/torch/backends/cuda/__init__.py:131: UserWarning: This API is going to be deprecated, please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:78.)
  return torch._C._get_cublas_allow_tf32()
....ssssssssssss..ssssss..ssssss............ssssssssssssssssssssssssssssssssssss..ssssssssssssss..ssssss..ssssssssssssssssssssssssssssss..ssssss....ssssssssssss..ssssss..ssssss............ssssssssssssssssssssssssssssssssssss..ssssss..ssssssssssssss..ssssss..ssssss..ssssssssssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssss..ssssssssssssss
----------------------------------------------------------------------
Ran 432 tests in 8.575s

OK (skipped=344)
```
After
```
$ python test_ops.py -k chebyshev_polynomial_
ssssssss........................ssssssssssssssss......../home/ubuntu/py3.10-nightly/lib/python3.10/site-packages/torch/backends/cuda/__init__.py:131: UserWarning: This API is going to be deprecated, please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:78.)
  return torch._C._get_cublas_allow_tf32()
........................................................................................ssssssss................ssssssssssssssssssssssss........................................................................................................ssssssss........................ssssssss........................................................................................ssssssss
----------------------------------------------------------------------
Ran 432 tests in 42.379s

OK (skipped=80)
```

Fixes https://github.com/pytorch/pytorch/issues/79528

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157464
Approved by: https://github.com/Skylion007
2025-07-02 23:16:52 +00:00
7cfd054075 [attempt 2] Compute contiguity symbolically to avoid dde, and introduce c++ sym_is_contiguous (#157472)
Summary:
When we compute contiguity for a tensor with dynamic shapes we first:
1) Try to compute it without guarding.
2) If all shapes are hinted, compute it, potentially adding guards.
3) If any input is not hinted, compute it symbolically.

sym_is_contiguous returns a SymBool that is then either evaluated, or guard_or_false can be called on it to avoid data-dependent errors.

ex:
 bool is_contiguous = input.sym_is_contiguous().guard_or_false(__FILE__, __LINE__);
is_contiguous_or_false is a helper function that does that.

In this PR I only handle default contiguity and will follow up with changes for other formats like channels_last.
We use this pattern in several locations in this PR to avoid DDEs.
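
On the Python side, the analogous avoid-the-guard pattern looks roughly like this (a sketch; it assumes `guard_or_false` is importable from `torch.fx.experimental.symbolic_shapes`, as on recent builds):

```python
from torch.fx.experimental.symbolic_shapes import guard_or_false

def pick_kernel(u):
    # `u` is an unbacked SymInt (e.g. from .nonzero()); `u % 2 == 0` is a
    # SymBool with no hint. Evaluating it directly could raise a
    # data-dependent error, whereas guard_or_false returns False instead
    # of guarding when the answer cannot be determined statically.
    if guard_or_false(u % 2 == 0):
        return "vectorized"  # taken only when divisibility is provable
    return "generic"         # safe fallback; no guard is added
```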

Test Plan:
contbuild & OSS CI,

Rollback Plan:

Reviewed By: malfet

Differential Revision: D77639021

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157472
Approved by: https://github.com/aorenste
2025-07-02 23:12:29 +00:00
d40aaa42ee [BE][16/16] fix typos in torch/ (torch/utils/) (#156606)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156606
Approved by: https://github.com/albanD
ghstack dependencies: #156318, #156320, #156602, #156604
2025-07-02 22:55:29 +00:00
11c07c848c [BE][14/16] fix typos in torch/ (torch/fx/) (#156604)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156604
Approved by: https://github.com/jingsh
ghstack dependencies: #156318, #156320, #156602
2025-07-02 22:55:29 +00:00
db259bd6b8 [BE][12/16] fix typos in torch/ (#156602)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156602
Approved by: https://github.com/justinchuby, https://github.com/albanD
ghstack dependencies: #156318, #156320
2025-07-02 22:55:29 +00:00
d5cdc36943 [BE][10/16] fix typos in torch/ (torch/csrc/jit/) (#156320)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156320
Approved by: https://github.com/albanD
ghstack dependencies: #156318
2025-07-02 22:55:29 +00:00
541584d22e [BE][8/16] fix typos in torch/ (torch/csrc/jit/) (#156318)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156318
Approved by: https://github.com/albanD
2025-07-02 22:55:29 +00:00
c0e155a8d2 [cutlass backend] Use alignment of D for EVT / Float8 (#157402)
I encountered a C++ compile error when running cutlass backend tests while upgrading the cutlass version. It seems Nvidia added
"static_assert(detail::is_aligned<ElementC_, AlignmentC, ElementD_, AlignmentD>(),"

b995f93317/include/cutlass/epilogue/collective/builders/sm90_builder.inl (L297)

However, it seems codegen has the wrong alignment for D. For C, 1 is okay since it is void, but for D this is probably wrong.
```
    void, cutlass::layout::ColumnMajor, 1,
    cutlass::bfloat16_t, cutlass::layout::RowMajor, 1,
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157402
Approved by: https://github.com/ColinPeppler, https://github.com/mlazos
2025-07-02 22:55:00 +00:00
48560eef80 [dynamo] Fix bug in dict(mapping_proxy) (#157467)
Fixes https://github.com/pytorch/pytorch/issues/157284

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157467
Approved by: https://github.com/jansel, https://github.com/StrongerXi

Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
2025-07-02 22:13:02 +00:00
fd4f704905 [ez][CI] Print set output in CI (#157477)
Print the output that's being set, for better debugging.

It's probably bad there are 4 of these, but I'm also not sure if imports will behave correctly
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157477
Approved by: https://github.com/huydhn
2025-07-02 21:47:19 +00:00
60e66d11ab [CI] Keep-going on main (#157470)
Run an experiment where we turn on keep-going on main. Revert this PR to cancel the experiment.

There have been a couple of changes that make it so that HUD will show the failure early even while the job is in progress, so triaging for reverts should still be able to happen quickly
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157470
Approved by: https://github.com/huydhn, https://github.com/ZainRizvi, https://github.com/malfet
2025-07-02 21:42:46 +00:00
4b4c2a7b1d Support complex numbers in DTensor redistribute (#157329)
Add complex number unwrapping in functional collectives used by DTensor.

Complex tensors are not directly supported by the underlying comm
kernels (e.g. nccl), but a complex tensor can be viewed as a real tensor
of one higher rank (an added size-2 trailing dim holds the real and
imaginary components). The collective output is then viewed as complex
to restore the original/expected shape and dtype.
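
A sketch of the unwrapping trick for a single collective (the comm kernel only ever sees the real-valued view):

```python
import torch
import torch.distributed as dist

def all_reduce_complex(t: torch.Tensor) -> torch.Tensor:
    # complex tensor of shape (...) -> real view of shape (..., 2),
    # where the trailing size-2 dim holds (real, imag).
    real_view = torch.view_as_real(t)
    dist.all_reduce(real_view)  # backend only has to handle real dtypes
    return torch.view_as_complex(real_view)  # restore shape and dtype
```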

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157329
Approved by: https://github.com/XilunWu
2025-07-02 21:37:16 +00:00
af9c92b4cb [CI] Remove redundant accuracy benchmarks for cpp_wrapper (#155966)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155966
Approved by: https://github.com/desertfire
2025-07-02 20:58:08 +00:00
c09cf29d7d [ez][BE] Tag deletion script to delete any old ciflow + autorevert tags (#157468)
Change the branch/tag deletion script that runs once per day to delete more tags

Previous: only delete ciflow tags that didn't correspond to an open PR
New: delete ciflow tags attached to commits that are > 7 days old.  Also delete `trunk/<sha>` tags (I think they are for autorevert) attached to commits that are > 7 days old

It's hard to figure out when the tag itself was pushed or created, so the script instead looks at the commit date, which might lead to unexpected behavior if the tag was pushed much later than the commit (e.g., triggering periodic later to bisect).  I think that's OK, though, since you don't really need the tag after the workflow runs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157468
Approved by: https://github.com/izaitsevfb
2025-07-02 20:42:32 +00:00
6f60cfe9b1 [ez] Add super().setUp() in test_ops::TestFakeTensor (#157475)
Noticed some disable issues getting a bunch of comments, so I took a look

One day I'll write a better check for this
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157475
Approved by: https://github.com/huydhn
2025-07-02 20:34:00 +00:00
e20784f228 [dynamo] Support BUILTIN_MATCH serialization. (#157016)
Serialize BUILTIN_MATCH since the targets are all stored in the `__builtin__` dict.

Also fixed an issue where the wrong global scope was passed to CheckFunctionManager while loading guards. Previously we could always reuse the compile-time global scope for evaluating guards, because the compile-time and runtime global scopes were always the same.

For precompile, we need to serialize the compile-time global scope for loading only. After loading finishes, we need to point the CheckFunctionManager at the new global scope for evaluating guards.

Differential Revision: [D77159313](https://our.internmc.facebook.com/intern/diff/D77159313/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157016
Approved by: https://github.com/jansel, https://github.com/jamesjwu
2025-07-02 20:24:24 +00:00
172853547a [inductor] more size_hint_or_throw usage (#157394)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157394
Approved by: https://github.com/jingsh
2025-07-02 20:20:59 +00:00
e0ab1b538a [ez][BE] Remove max jobs override for CI build jobs (#157473)
Basically reverts #147487 since it's not needed anymore

Not an exact revert because some things have already been removed in a different PR
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157473
Approved by: https://github.com/huydhn
2025-07-02 20:12:28 +00:00
3f569f9af7 [BE] Remove extra semicolon (#157486)
Fixes
```
/Users/nshulga/git/pytorch/pytorch/torch/nativert/executor/GraphExecutorBase.cpp:16:58: warning: extra ';' outside of a function is incompatible with C++98 [-Wc++98-compat-extra-semi]
   16 |       execPlan_(ExecutionPlanner{graph_}.createPlan()) {};
      |                                                          ^
1 warning generated.

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157486
Approved by: https://github.com/seemethere, https://github.com/atalman, https://github.com/Skylion007
2025-07-02 19:56:21 +00:00
94716db222 [BE][DCE] eliminate remnants of global gemm cache (#157327)
Summary: The global gemm cache has not been maintained in ~1 year, and the only entry point (`search_autotune_cache`) was recently deprecated. Meaning, this is now dead code that we can remove.

Test Plan:
CI

Rollback Plan:

Differential Revision: D77520979

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157327
Approved by: https://github.com/jansel
2025-07-02 19:52:35 +00:00
06f39a71b6 Add Release 2.8 CUDA matrix. Update Release schedule for 2.7.1 and 2.9 (#157482)
This PR:
- Adds the Release 2.8 CUDA matrix
- Updates the Release 2.9 schedule to be more similar to the 2.5 release schedule: mid-October release
- Updates the 2.7.1 release day
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157482
Approved by: https://github.com/Camyll
2025-07-02 19:52:24 +00:00
36dd598bda layernorm tests: Tweak test thresholds for comparing tensors (#156699)
After I landed this PR: https://github.com/pytorch/pytorch/pull/156600, this test was failing internally on large tensors because the differences were greater than the tolerances on some CUDA devices.

We now raise the tolerances for larger tensors.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156699
Approved by: https://github.com/eqy, https://github.com/ngimel
2025-07-02 19:33:38 +00:00
32983ea698 [nativert] continue to move generated static dispatch kernels (#157460)
Summary: att

Test Plan:
ci

Rollback Plan:

Differential Revision: D77623080

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157460
Approved by: https://github.com/zhxchen17
2025-07-02 19:28:13 +00:00
5e636d664a [BE] @serialTest decorator must be called (#157388)
Otherwise it turns the test into a trivial one (that always succeeds), as the following example demonstrates
```python
import torch
from torch.testing._internal.common_utils import serialTest, run_tests, TestCase

class MegaTest(TestCase):
    @serialTest
    def test_foo(self):
        if hasattr(self.test_foo, "pytestmark"):
            print("foo has attr and it is", self.test_foo.pytestmark)
        print("foo")

    @serialTest()
    def test_bar(self):
        if hasattr(self.test_bar, "pytestmark"):
            print("bar has attr and it is", self.test_bar.pytestmark)
        print("bar")

if __name__ == "__main__":
    run_tests()
```

That will print
```
test_bar (__main__.MegaTest.test_bar) ... bar has attr and it is [Mark(name='serial', args=(), kwargs={})]
bar
ok
test_foo (__main__.MegaTest.test_foo) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.013s

```

Added an assert that the arg is a boolean in the decorator, to prevent such silent skips in the future

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157388
Approved by: https://github.com/clee2000
2025-07-02 19:15:19 +00:00
eaf32fffb7 fixed a tiny typo in torch.compiler.md (#157462)
Fixes #157444

there was a typo in [docs/source/torch.compiler.md](https://github.com/pytorch/pytorch/blob/main/docs/source/torch.compiler.md) : see -> seen
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157462
Approved by: https://github.com/Skylion007, https://github.com/svekars
2025-07-02 19:15:15 +00:00
0e9d8032a3 [build] remove cmake cache and reconfigure again if it is invalid (#156958)
See also:

- astral-sh/uv#14269

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156958
Approved by: https://github.com/Skylion007
ghstack dependencies: #156742
2025-07-02 18:46:32 +00:00
0105cd89ab [ONNX] Fix conversion of attention - 4D (#157130)
Fixes a wrong conversion to ONNX found while investigating #149662.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157130
Approved by: https://github.com/gramalingam, https://github.com/justinchuby, https://github.com/titaiwangms

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2025-07-02 18:05:10 +00:00
d5d14ee823 [nativert] create persistent value helper (#157286)
Summary: att

Test Plan: CI

Reviewed By: georgiaphillips

Differential Revision: D74300519

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157286
Approved by: https://github.com/SherlockNoMad
2025-07-02 17:15:52 +00:00
156bc243f0 Back out "Include c++ stack traces when we hit constraint violation (#155603)" (#157406)
Summary:
Original commit changeset: 4b3fdaa8f2c6

Original Phabricator Diff: D76434787

Meta:
https://fb.workplace.com/groups/1286739428954016/permalink/1535462614081695/

Test Plan:
Meta:
Revert D76434787 for S536719

Rollback Plan:

Differential Revision: D77626334

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157406
Approved by: https://github.com/bobrenjc93
2025-07-02 16:51:07 +00:00
bd6b5fddbf [Precompile] [easy] Serialize requires_grad for tensors when serializing guards (#157372)
Need to keep requires_grad on the tensor when serializing/deserializing guards. This matters when there's a TENSOR_MATCH guard on a tensor that requires_grad. Added a unit test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157372
Approved by: https://github.com/jansel, https://github.com/zhxchen17
ghstack dependencies: #156433
2025-07-02 16:34:37 +00:00
54701a0c94 Add is_hidden_event method to KinetoEvent Python interface (#155214)
Fixes #155213

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155214
Approved by: https://github.com/sraikund16
2025-07-02 16:29:21 +00:00
0edc1b91f7 [Inductor] Disable decompose_k for AMD (#157283)
Differential Revision: D77544250

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157283
Approved by: https://github.com/bdhirsh
2025-07-02 15:21:46 +00:00
9f5276dc07 Fix typo: 'Intializes' → 'Initializes' in _distributed_c10d.pyi docst… (#157455)
Description:

This PR fixes a small documentation typo in torch/_C/_distributed_c10d.pyi, correcting:

Intializes → Initializes

This helps improve clarity in internal docstrings for maintainers and contributors.
Let me know if further changes are needed. Thanks for your time and the amazing work on PyTorch!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157455
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-07-02 15:19:05 +00:00
9d175bc7e6 Fixes for CPython int/float tests (#155978)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155978
Approved by: https://github.com/zou3519
2025-07-02 15:04:00 +00:00
b096341963 [BE] use pathlib.Path instead of os.path.* in setup.py (#156742)
Resolves:

- https://github.com/pytorch/pytorch/pull/155998#discussion_r2164376634

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156742
Approved by: https://github.com/malfet
2025-07-02 14:57:58 +00:00
82eefaedd9 [inductor][user triton] sanitize triple-quoted docstrings in kernel definitions (#157322)
Fixes #155006

Inductor sometimes codegens triton kernel definitions into a triple-quoted text block. If the text block itself contains triple-quotes, this breaks. Notably, this can happen for user-defined triton kernels, where the user may have added a docstring in their triton kernel.
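
A toy illustration of the failure mode (not Inductor's actual codegen): a source string that itself contains triple quotes terminates an enclosing triple-quoted literal early.

```python
import ast

kernel_src = '''
@triton.jit
def my_kernel(x_ptr):
    """This user docstring contains triple quotes."""
    ...
'''

# Wrap the kernel source in a triple-quoted block, as codegen might:
generated = 'run("""' + kernel_src + '""")'

# The docstring's own quotes close the outer literal early, so the
# generated module no longer parses:
try:
    ast.parse(generated)
except SyntaxError as e:
    print("broken codegen:", e)
```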

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157322
Approved by: https://github.com/zou3519, https://github.com/drisspg
2025-07-02 14:02:01 +00:00
c553c55be7 Revert "Fix full_like decomposition to preserve strides (#144765)"
This reverts commit 01b0f09931d47bd2716398a0c335b2807dc3074d.

Reverted https://github.com/pytorch/pytorch/pull/144765 on behalf of https://github.com/jeanschmidt due to Seems to be breaking internal tests see [D77652778](https://www.internalfb.com/diff/D77652778), @jansel may you help get this PR merged? ([comment](https://github.com/pytorch/pytorch/pull/144765#issuecomment-3027975098))
2025-07-02 13:56:03 +00:00
d5a89178b0 Revert "[dynamo] Add fx_graph_runnable test coverage (#157021)"
This reverts commit 77676753ecabf6a6645bdd3abfe01939e5751e76.

Reverted https://github.com/pytorch/pytorch/pull/157021 on behalf of https://github.com/jeanschmidt due to New tests are red internally, more details on [D77652793](https://www.internalfb.com/diff/D77652793). Maybe codev could be a better strategy to merge this PR faster... ([comment](https://github.com/pytorch/pytorch/pull/157021#issuecomment-3027952946))
2025-07-02 13:48:41 +00:00
bdb7819166 [dynamo, nested graph breaks] remove recursive cell/freevar in instruction tx (#154078)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154078
Approved by: https://github.com/StrongerXi, https://github.com/jansel
2025-07-02 13:36:14 +00:00
34c8033fd3 Fix a div_mod bug in generic_math.h (#157383)
Summary: There is a bug in integer div_mod: when the remainder is 0 and the divisor is negative, the mod operation produces a negative number. Fixed in this PR.
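
A Python sketch of the failure mode (the real fix is in C++ `generic_math.h`, whose exact expressions differ): deriving a floored mod from a C-style truncated remainder must skip the sign adjustment when the remainder is zero.

```python
def buggy_mod(a: int, b: int) -> int:
    r = abs(a) % abs(b) * (1 if a >= 0 else -1)  # C-style remainder
    if (r < 0) != (b < 0):  # BUG: flips the sign even when r == 0
        r += b
    return r

def fixed_mod(a: int, b: int) -> int:
    r = abs(a) % abs(b) * (1 if a >= 0 else -1)
    if r != 0 and (r < 0) != (b < 0):  # the missing zero check
        r += b
    return r

print(buggy_mod(6, -3))  # -3, but 6 mod -3 should be 0
print(fixed_mod(6, -3))  #  0
```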
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157383
Approved by: https://github.com/angelayi, https://github.com/jingsh
2025-07-02 12:22:57 +00:00
ab2294d828 [dynamo] fix _torchdynamo_orig_callable naming issues (#156901)
`_torchdynamo_orig_callable` was being used in two distinct places:
- to get the original user function from nested eval_frame.py decorators
- to get the original backend from nested convert_frame.py callbacks

We rename ~~the first usage to `_torchdynamo_orig_fn`~~ and the second to `_torchdynamo_orig_backend` in order to distinguish these cases.

UPDATE: it seems both internal and OSS users depend on `_torchdynamo_orig_callable`, but only, it seems, in the first context. We should thus keep the original name for the first case.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156901
Approved by: https://github.com/StrongerXi, https://github.com/jansel
2025-07-02 09:53:55 +00:00
3173616532 [nativert] start to move generated static dispatch kernels (#157403)
Summary: att

Test Plan:
ci

Rollback Plan:

Differential Revision: D77622952

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157403
Approved by: https://github.com/georgiaphillips
2025-07-02 08:42:01 +00:00
8c0df6fe17 Revert "[dynamo][fsdp] Consistent behavior of int attributes (#157262)"
This reverts commit 42b48ee67229286127390000f103a11dfc8901f5.

Reverted https://github.com/pytorch/pytorch/pull/157262 on behalf of https://github.com/jeanschmidt due to Newly introduced tests are red in internal runs, check D77593713 ([comment](https://github.com/pytorch/pytorch/pull/157262#issuecomment-3026944993))
2025-07-02 08:30:39 +00:00
0364db7cd1 [PT] support custom all_gather and reduce_scatter comms (#155189)
Summary:
This change introduces 2 comm override APIs, `set_custom_all_gather` and `set_custom_reduce_scatter`, to allow for custom behavior of each.

This allows users to control how the comm buffers are allocated and the exact comm implementation, for flexibility.
For details, see the docstring of `Comm` in `_fsdp_api.py`

Related PR:
https://github.com/pytorch/pytorch/pull/150564

Test Plan: CI

Differential Revision: D75714362

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155189
Approved by: https://github.com/weifengpy
2025-07-02 06:58:45 +00:00
f8c0a4bd28 [inductor] enable bf32 test for mkldnn conv (#127293)
Enable more tests of inductor conv + bf32.
Testplan:
```
python test/inductor/test_mkldnn_pattern_matcher.py -k test_conv2d_unary_cpu
python test/inductor/test_mkldnn_pattern_matcher.py -k test_conv3d_unary_cpu
python test/inductor/test_mkldnn_pattern_matcher.py -k test_conv_transpose2d_unary
python test/inductor/test_mkldnn_pattern_matcher.py -k test_conv2d_binary
python test/inductor/test_mkldnn_pattern_matcher.py -k test_conv3d_binary
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/127293
Approved by: https://github.com/jgong5
ghstack dependencies: #126050, #126054

Co-authored-by: Jiang, Yanbing <yanbing.jiang@intel.com>
2025-07-02 01:49:01 +00:00
4c8eb65efb allow to use bf16 as fp32 internal precision for mkldnn conv backward (#126054)
Used for CI since it depends on an ideep update.

Allow using `BF16` as the internal computation data type via `torch.backends.mkldnn.conv.fp32_precision="bf16"`
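
Minimal usage of the flag named above (the model and shapes are illustrative):

```python
import torch
import torch.nn as nn

# Keep FP32 tensors at the API surface, but let oneDNN compute the conv
# (forward and, with this PR, backward) in BF16 internally.
torch.backends.mkldnn.conv.fp32_precision = "bf16"

conv = nn.Conv2d(64, 256, kernel_size=1)
x = torch.randn(32, 64, 56, 56, requires_grad=True)
conv(x).sum().backward()  # backward also takes the BF16 internal path
```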

### TestPlan
python test/test_mkldnn.py -k conv

### Benchmarking

FP32 conv2d backward vs. BF16 internal computation conv backward on SPR

Single core:

Input | fp32 ms | bf16 internal  ms | Speed up
-- | -- | -- | --
IC:   64, OC: 256, kernel: 1, stride: 1, N: 256, H: 56, W: 56, G: 1, pad: 0 | 461.6734| 358.3779| 1.48
IC:   128, OC: 512, kernel: 1, stride: 1, N: 256, H: 28, W: 28, G: 1, pad: 0 | 358.3779 | 247.8631| 1.46
IC: 256, OC: 256, kernel: 3, stride: 1,   N: 1, H: 16, W: 16, G: 1, pad: 0 | 4.3783| 3.8513| 1.14

56 cores:
Input | fp32 ms | bf16 internal ms | Speed up
-- | -- | -- | --
IC:   64, OC: 256, kernel: 1, stride: 1, N: 256, H: 28, W: 28, G: 1, pad: 0 | 16.6119 | 12.2047 | 1.38
IC:   128, OC: 512, kernel: 1, stride: 1, N: 256, H: 28, W: 28, G: 1, pad: 0 | 12.0016 | 8.6711 | 1.38
IC:   256, OC: 1024, kernel: 1, stride: 1, N: 256, H: 14, W: 14, G: 1, pad: 0 | 20.5947 | 15.9366 | 1.29
IC: 1024, OC: 256, kernel: 1, stride: 1,   N: 256, H: 14, W: 14, G: 1, pad: 0 | 40.0952 | 32.2222 | 1.24
IC: 256, OC: 256, kernel: 3, stride: 1,   N: 1, H: 16, W: 16, G: 1, pad: 0 | 162.7449 | 142.3054 | 1.14

Pull Request resolved: https://github.com/pytorch/pytorch/pull/126054
Approved by: https://github.com/jgong5
ghstack dependencies: #126050

Co-authored-by: Jiang, Yanbing <yanbing.jiang@intel.com>
2025-07-02 01:40:13 +00:00
5a2db5152d allow to use bf16 as fp32 internal precision for mkldnn conv (#126050)
Allow using `BF16` as the internal computation data type via `torch.backends.mkldnn.conv.fp32_precision="bf16"`

### TestPlan
python test/test_mkldnn.py -k conv

### Benchmarking

FP32 conv2d vs. BF16 internal computation conv2d on SPR

Single core:

Input | fp32 ms | bf16 internal  ms | Speed up
-- | -- | -- | --
IC:   64, OC: 256, kernel: 1, stride: 1, N: 256, H: 56, W: 56, G: 1, pad: 0 | 185.5071 | 83.4749 | 2.22
IC:   128, OC: 512, kernel: 1, stride: 1, N: 256, H: 28, W: 28, G: 1, pad: 0 | 194.7558 | 79.1683| 2.46
IC: 256, OC: 256, kernel: 3, stride: 1,   N: 1, H: 16, W: 16, G: 1, pad: 0 | 1.9213 | 1.3690 | 1.40

56 cores:
Input | fp32 ms | bf16 internal ms | Speed up
-- | -- | -- | --
IC:   64, OC: 256, kernel: 1, stride: 1, N: 256, H: 28, W: 28, G: 1, pad: 0 | 6.5804  | 7.4349 | 0.89
IC:   128, OC: 512, kernel: 1, stride: 1, N: 256, H: 28, W: 28, G: 1, pad: 0 | 4.9940  | 3.8093 | 1.31
IC:   256, OC: 1024, kernel: 1, stride: 1, N: 256, H: 14, W: 14, G: 1, pad: 0 | 8.8359 | 5.5802 | 1.58
IC: 1024, OC: 256, kernel: 1, stride: 1,   N: 256, H: 14, W: 14, G: 1, pad: 0 | 16.5800 | 9.2367 | 1.80
IC: 256, OC: 256, kernel: 3, stride: 1,   N: 1, H: 16, W: 16, G: 1, pad: 0 | 79.5436 | 38.3861  | 2.07

Pull Request resolved: https://github.com/pytorch/pytorch/pull/126050
Approved by: https://github.com/jgong5, https://github.com/jansel

Co-authored-by: Jiang, Yanbing <yanbing.jiang@intel.com>
2025-07-02 01:31:23 +00:00
0a63053fe9 Don't store flamegraph to tmp folder (#157374)
Where it's accessible (and mutable) by multiple users. Use the
`~/.cache` folder instead.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157374
Approved by: https://github.com/eqy
ghstack dependencies: #157373
2025-07-02 00:46:51 +00:00
bb476310a4 [dynamo][guards] Stash root guard manager pointer in the LeafGuard (#157325)
Preparing to simplify the recompilation reason codebase. This PR was 95% done by using AI tools.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157325
Approved by: https://github.com/jansel
2025-07-02 00:42:43 +00:00
fa1c20ae92 Fix test consolidate hf safetensors (#157386)
An argument name was changed in the test, so update it here so that it no longer throws.

Differential Revision: [D77604210](https://our.internmc.facebook.com/intern/diff/D77604210/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157386
Approved by: https://github.com/meetv18
ghstack dependencies: #154743, #156705
2025-07-02 00:16:21 +00:00
77676753ec [dynamo] Add fx_graph_runnable test coverage (#157021)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157021
Approved by: https://github.com/StrongerXi, https://github.com/xmfan

Co-authored-by: Simon Fan <xmfan@meta.com>
2025-07-02 00:10:01 +00:00
617e3f69f8 [FP8] Fix Benchmarking for certain Priors (#155722)
Summary: For priors like layer norm, the order of the weight quantization kernel might be different, and the kernel may therefore have a different suffix, so we use a regular expression instead.

Test Plan:
Trying this on model id 737772166 with
```
buck2 run mode/opt mode/inplace -c fbcode.platform010_cuda_version=12 -c fbcode.nvcc_arch=h100 caffe2/torch/fb/model_transform/experimental/benchmark:mts_gpu_benchmark -- --lower-backend=AOT_INDUCTOR --model-snapshot-id=737772166_0 --trace-aot-inductor-module=True --disable-acc-tracer=False --batch-size=1024 --node_replacement_dict "{'(autotune)':{'(1000+,1000+)':'fp8_float_model_dynamic_quantization_rowwise'}}"
```
will allow more linears to be correctly replaced with fp8.
An example of the gpu trace can be found in https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/hpc/new/models/feed/benchmark/libkineto_activities_773108_f58b57e208c04787acd3bcb01a3e8771.json.gz&bucket=gpu_traces.

Rollback Plan:

Differential Revision: D76092551

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155722
Approved by: https://github.com/Skylion007
2025-07-02 00:01:23 +00:00
ab6cb34480 Revert "[inductor][user triton] sanitize triple-quoted docstrings in kernel definitions (#157322)"
This reverts commit 563fd95563c5edd732ae260b3bd3d0c38822ab57.

Reverted https://github.com/pytorch/pytorch/pull/157322 on behalf of https://github.com/davidberard98 due to fails on rocm ([comment](https://github.com/pytorch/pytorch/pull/157322#issuecomment-3025826951))
2025-07-01 23:21:37 +00:00
c6a27bae36 Revert "[do not revert] Compute contiguity symbolically to avoid dde, and introduce c++ sym_is_contiguous (#155590)"
This reverts commit d0a9629435aaceb5acbf31aad70f2109cb8a3ea2.

Reverted https://github.com/pytorch/pytorch/pull/155590 on behalf of https://github.com/laithsakka due to was asked by to land this internally  ([comment](https://github.com/pytorch/pytorch/pull/155590#issuecomment-3025796794))
2025-07-01 22:58:14 +00:00
563fd95563 [inductor][user triton] sanitize triple-quoted docstrings in kernel definitions (#157322)
Fixes #155006

Inductor sometimes codegens triton kernel definitions into a triple-quoted text block. If the text block itself contains triple-quotes, this breaks. Notably, this can happen for user-defined triton kernels, where the user may have added a docstring in their triton kernel.
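
A minimal sketch of the failure mode (the kernel and wrapper here are illustrative, not taken from the PR):

```py
import torch
import triton
import triton.language as tl

@triton.jit
def copy_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
    """A user docstring that itself contains triple quotes: ''' """
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(out_ptr + offs, tl.load(x_ptr + offs, mask=mask), mask=mask)

@torch.compile
def f(x):
    out = torch.empty_like(x)
    copy_kernel[(triton.cdiv(x.numel(), 1024),)](x, out, x.numel(), BLOCK=1024)
    return out

# Inductor embeds the kernel source in a quoted block; without sanitization,
# the docstring's quotes can terminate that block early and break codegen.
f(torch.randn(4096, device="cuda"))
```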

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157322
Approved by: https://github.com/zou3519, https://github.com/drisspg
2025-07-01 22:51:11 +00:00
6ef70edd9a Revert "Inductor logging + analysis of torch.profile (#149697)"
This reverts commit 47f10d0ad0dda281c886ff08ac2f938207027316.

Reverted https://github.com/pytorch/pytorch/pull/149697 on behalf of https://github.com/malfet due to Looks like it's breaking ROCM tests, see https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=rocm%20%2F%20linux-jammy ([comment](https://github.com/pytorch/pytorch/pull/149697#issuecomment-3025673908))
2025-07-01 22:11:53 +00:00
3df6360e8c [BE][Easy][setup] use super().method(...) in command subclasses in setup.py (#156044)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156044
Approved by: https://github.com/albanD
ghstack dependencies: #156741
2025-07-01 22:09:10 +00:00
d0a9629435 [do not revert] Compute contiguity symbolically to avoid dde, and introduce c++ sym_is_contiguous (#155590)
When we compute contiguity for a tensor with dynamic shapes, we:
1) Try to compute it without guarding.
2) If all shapes are hinted, compute it with potentially adding guards.
3) If any input is not hinted, compute it symbolically.

`sym_is_contiguous` returns a `SymBool` that is then either evaluated, or has `guard_or_false` called on it to avoid data-dependent errors, e.g.:

`bool is_contiguous = input.sym_is_contiguous().guard_or_false(__FILE__, __LINE__);`

`is_contiguous_or_false` is a helper function that does exactly that.

In this PR I only handle default contiguity; a follow-up will handle other formats like `channels_last`.
We use this pattern in several places in this PR to avoid DDEs.
Differential Revision: [D77183032](https://our.internmc.facebook.com/intern/diff/D77183032)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155590
Approved by: https://github.com/ezyang
2025-07-01 21:39:38 +00:00
22edb457c9 [invoke_subgraph][partitioner] Add meta val on run_and_save_rng ops (#157319)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157319
Approved by: https://github.com/zou3519
2025-07-01 21:02:08 +00:00
e5f6ffd810 [BE] Replace checkcall("chmod") with os.chmod (#157373)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157373
Approved by: https://github.com/clee2000, https://github.com/eqy, https://github.com/Skylion007
2025-07-01 20:46:25 +00:00
019e30e3b8 [BE] Decorate LargeTensorTest with serialTests (#157382)
Maybe it'll help make M2-15 jobs more stable, as that was the last test run before the OOM.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157382
Approved by: https://github.com/clee2000
2025-07-01 20:35:42 +00:00
4500a4aa50 remove allow-untyped-defs from torch/backends/mps/__init__.py (#157227)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157227
Approved by: https://github.com/Skylion007
2025-07-01 20:00:19 +00:00
6bc263809d [SymmMem] Add NVSHMEM_CHECK macro (#157174)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157174
Approved by: https://github.com/fduwjj, https://github.com/fegin
2025-07-01 19:50:28 +00:00
ffac0de07e [export] Remove stack trace from input/output (#157302)
Fixes https://github.com/pytorch/pytorch/issues/157183

https://github.com/pytorch/pytorch/pull/156257 consolidated the path for saving stack traces, but missed the part where stacktraces are not added to placeholder/output nodes in proxy_tensor tracing [(code)](https://github.com/pytorch/pytorch/pull/156257/files#diff-6960ce90e7162c0953b1ca07e92e7f0f2f6ba63b427b42df593e20cc6a096bb7L1107).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157302
Approved by: https://github.com/yushangdi
2025-07-01 19:16:28 +00:00
01b0f09931 Fix full_like decomposition to preserve strides (#144765)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144765
Approved by: https://github.com/amjames, https://github.com/jansel
2025-07-01 19:13:22 +00:00
6401d1d53d Revert "Fused RMSNorm implementation (#153666)"
This reverts commit e1aee86646aa6d1b9cb9d34351e43936401c5efc.

Reverted https://github.com/pytorch/pytorch/pull/153666 on behalf of https://github.com/davidberard98 due to causing build failures on main branch [GH job link](https://github.com/pytorch/pytorch/actions/runs/16007148842/job/45156382001) [HUD commit link](e1aee86646) ([comment](https://github.com/pytorch/pytorch/pull/153666#issuecomment-3025146176))
2025-07-01 18:46:45 +00:00
3a5677a380 Revert "ci: Add ability to test images for build-triton-wheel (#156894)"
This reverts commit 0e47312ae5a687f0aed61db753d03180118cddc4.

Reverted https://github.com/pytorch/pytorch/pull/156894 on behalf of https://github.com/seemethere due to causing issues in downstream builds see https://github.com/pytorch/pytorch/pull/156664 for more info ([comment](https://github.com/pytorch/pytorch/pull/156894#issuecomment-3025137790))
2025-07-01 18:43:34 +00:00
02608e560a [ROCm] Add more shards for inductor dashboard, more frequent runs (#157288)
Also increases regularity of dashboard runs on ROCm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157288
Approved by: https://github.com/jeffdaily
2025-07-01 18:27:30 +00:00
e1aee86646 Fused RMSNorm implementation (#153666)
Relevant #72643

Benchmarked versus the unfused torch implementation and the torch.compile implementation: around a 9x speedup over the unfused implementation on CUDA, and slightly faster than the inductor-compiled version on a 5090.

```py
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        norm_x = x.norm(2, dim=-1, keepdim=True)
        rms_x = norm_x * torch.rsqrt(torch.tensor(x.shape[-1], dtype=x.dtype))
        x_normed = x / (rms_x + self.eps)
        return self.scale * x_normed

def benchmark_rmsnorm_cuda(input_shape, normalized_dim, num_iterations=100, warmup_iterations=10, dtype=torch.float16):
    rms_norm_layer = torch.nn.RMSNorm(normalized_dim, device='cuda', dtype=dtype)
    input_data = torch.randn(input_shape, device='cuda', dtype=dtype)

    for _ in range(warmup_iterations):
        _ = rms_norm_layer(input_data)
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_iterations):
        _ = rms_norm_layer(input_data)

    end_event.record()
    torch.cuda.synchronize()
    elapsed_time_ms = start_event.elapsed_time(end_event)
    avg_time_ms = elapsed_time_ms / num_iterations

    print(f"--- RMSNorm CUDA Benchmark ---")
    print(f"Input Shape: {input_shape}")
    print(f"Normalized Dimension: {normalized_dim}")
    print(f"Benchmark Iterations: {num_iterations}")
    print(f"--- Fused Implementation ---")
    print(f"Average Time per Iteration: {avg_time_ms:.4f} ms")
    print(f"Total Time for {num_iterations} Iterations: {elapsed_time_ms:.3f} ms")

    compiled_rms_norm = torch.compile(RMSNorm(dim=normalized_dim)).cuda()
    for _ in range(warmup_iterations):
        _ = compiled_rms_norm(input_data)
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_iterations):
        _ = compiled_rms_norm(input_data)
    end_event.record()
    torch.cuda.synchronize()
    elapsed_time_ms = start_event.elapsed_time(end_event)
    avg_time_ms = elapsed_time_ms / num_iterations

    print(f"--- TorchCompile Implementation ---")
    print(f"Average Time per Iteration: {avg_time_ms:.4f} ms")
    print(f"Total Time for {num_iterations} Iterations: {elapsed_time_ms:.3f} ms")

    print("-" * 50)

if __name__ == '__main__':
    parameter_sets = [
        {'batch_size': 16, 'sequence_length': 256, 'hidden_features': 512, 'dtype': torch.float16},
        {'batch_size': 32, 'sequence_length': 512, 'hidden_features': 768, 'dtype': torch.float16},
        {'batch_size': 64, 'sequence_length': 1024, 'hidden_features': 1024, 'dtype': torch.float16},
        {'batch_size': 32, 'sequence_length': 512, 'hidden_features': 768, 'dtype': torch.float32},
        {'batch_size': 8, 'sequence_length': 2048, 'hidden_features': 2048, 'dtype': torch.float16},
    ]

    num_benchmark_iterations = 200
    num_warmup_iterations = 20

    for params in parameter_sets:
        batch_size = params['batch_size']
        sequence_length = params['sequence_length']
        hidden_features = params['hidden_features']
        data_type = params.get('dtype', torch.float16)

        shape = (batch_size, sequence_length, hidden_features)
        norm_dim_to_normalize = hidden_features

        print(f"Benchmarking with: BS={batch_size}, SeqLen={sequence_length}, Hidden={hidden_features}, DType={data_type}")
        benchmark_rmsnorm_cuda(input_shape=shape,
                               normalized_dim=norm_dim_to_normalize,
                               num_iterations=num_benchmark_iterations,
                               warmup_iterations=num_warmup_iterations,
                               dtype=data_type)
```

Here are the triton compile tests ran on a 5090 (comparing this branch vs main)
```py
import torch
import torch.nn as nn
from torch._inductor.utils import run_and_get_code, run_fw_bw_and_get_code

torch.manual_seed(0)

device = torch.device("cuda")

for batch in range(0, 9):
    for i in range(9, 16):
        normalized_shape_arg = (2**batch, 2**i)
        input_tensor = torch.randn(2**batch, 2**i, device=device, requires_grad=True)
        weight_tensor = torch.randn(2**batch, 2**i,device=device, requires_grad=True)

        model = torch.nn.functional.rms_norm
        compiled_model = torch.compile(model)
        loss = torch.randn_like(input_tensor)

        num_iter = 5
        for j in range(num_iter):
            output = compiled_model(input_tensor, normalized_shape_arg, weight_tensor)
            output.backward(loss)

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        num_iter = 10
        for j in range(num_iter):
            output = compiled_model(input_tensor, normalized_shape_arg, weight_tensor)
            output.backward(loss)

        end_event.record()
        torch.cuda.synchronize()

        elapsed_time_ms = start_event.elapsed_time(end_event)
        avg_time_ms = round(elapsed_time_ms / num_iter, 5)
        print(2**batch, 2**i, avg_time_ms)
```
main
```
32 512 0.1812
32 1024 0.19021
32 2048 0.18871
32 4096 0.17019
32 8192 0.21944
32 16384 0.38871
32 32768 0.83282
64 512 0.14705
64 1024 0.13987
64 2048 0.14111
64 4096 0.21699
64 8192 0.43141
64 16384 0.90652
64 32768 2.18573
128 512 0.19361
128 1024 0.1963
128 2048 0.20122
128 4096 0.38888
128 8192 0.93795
128 16384 2.23437
128 32768 5.50079
256 512 0.16722
256 1024 0.22856
256 2048 0.39421
256 4096 0.96621
256 8192 2.48746
256 16384 5.53571
256 32768 11.97932
```
current branch
```
32 512 0.16328
32 1024 0.18104
32 2048 0.15508
32 4096 0.14356
32 8192 0.20111
32 16384 0.45974
32 32768 0.94799
64 512 0.16874
64 1024 0.18701
64 2048 0.16107
64 4096 0.20152
64 8192 0.46568
64 16384 0.96599
64 32768 2.21661
128 512 0.14982
128 1024 0.15565
128 2048 0.22241
128 4096 0.46128
128 8192 0.88883
128 16384 2.3097
128 32768 5.84448
256 512 0.14346
256 1024 0.2007
256 2048 0.45927
256 4096 0.87876
256 8192 2.10571
256 16384 5.73948
256 32768 12.98581
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153666
Approved by: https://github.com/ngimel
2025-07-01 18:22:24 +00:00
1c8844d9e7 [MPS] Switch Cholesky decomp to column wise (#157014)
Everything should go through generalized kernels, and Metal kernels should work with the same sizes and strides as the CPU or CUDA backends, to avoid problems with `torch.compile`, which relies on the meta kernels to tell it what the output is going to look like.

To avoid returning tensors with a different layout depending on whether the `upper` parameter is true or false, templatize `factorDiagonalBlock`, `applyTRSM` and `applySYRK` to take upper/lower (actually row-wise vs column-wise) as a template argument, and call the appropriate templates from the host.
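
As a sanity check of the invariant this targets, something like the following should now hold (a sketch assuming an MPS-enabled build; not taken from the PR):

```py
import torch

A = torch.randn(8, 8)
A = A @ A.mT + 8 * torch.eye(8)  # symmetric positive-definite input
for upper in (False, True):
    cpu = torch.linalg.cholesky(A, upper=upper)
    mps = torch.linalg.cholesky(A.to("mps"), upper=upper)
    # Same layout regardless of `upper`, matching what the meta kernel reports.
    assert mps.stride() == cpu.stride()
    torch.testing.assert_close(mps.cpu(), cpu)
```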

TODOs:
 - Rename upper parameter to something more sensible and add comments
 - Use simd_groupsize instead of hardcoded 32 everywhere

Fixes https://github.com/pytorch/pytorch/issues/156658

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157014
Approved by: https://github.com/Skylion007, https://github.com/dcci
ghstack dependencies: #157179
2025-07-01 18:00:59 +00:00
720c2c46b1 [Inductor UT][XPU] Reduce the runtime of the test case test_comprehensive_nn_functional_max_pool2d_xpu. (#157357)
This test case has over a thousand input samples, causing it to run for more than 30 minutes, which triggers the timeout mechanism and breaks XPU CI. This PR limits the number of samples to one for this XPU case.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157357
Approved by: https://github.com/chuanqi129, https://github.com/jansel
2025-07-01 17:47:49 +00:00
3bc6bdc866 [BE] add type annotations and run mypy on setup.py (#156741)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156741
Approved by: https://github.com/aorenste
2025-07-01 17:09:05 +00:00
47f10d0ad0 Inductor logging + analysis of torch.profile (#149697)
Prereqs:
 - https://github.com/pytorch/pytorch/pull/152708

Features:
1. Adds inductor's estimate of flops and bandwidth to the json trace events that perfetto uses.
1. Only use the tflops estimation from Triton if we don't have the info from the datasheet, because Triton's estimates are inaccurate. I have a backlog item to fix Triton flops estimation upstream. New `DeviceInfo` class, and new function `get_device_tflops`.
1. New helpers `countable_fx` and `count_flops_fx` help get the flops of an `fx.Node`.
1. Extends Triton `torch.profiler` logging to `DebugAutotuner`.
1. New script `profile_analysis.py`: `--augment_trace` adds perf estimates to any perfetto json trace, `--analyze` creates a summary table of these perf estimates, and `--diff` will compare two traces side by side:
```
Device(NVIDIA H100, 0):
 Kernel Name                              | resnet Kernel Count | resnet FLOPS       | resnet bw gbps        | resnet Dur (ms)    | resnet Achieved FLOPS % | resnet Achieved Bandwidth % | newresnet Kernel Count | newresnet FLOPS    | newresnet bw gbps     | newresnet Dur (ms) | newresnet Achieved FLOPS % | newresnet Achieved Bandwidth %
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 triton_poi_fused__native_batch_norm_legi | 24                  | 0                  | 0.11395268248131513   | 2.5919166666666666 | 0                       | 0.003401572611382541        | 24                     | 0                  | 0.11395268248131513   | 2.5919166666666666 | 0                          | 0.003401572611382541
 sm90_xmma_fprop_implicit_gemm_f32f32_tf3 | 142                 | 16932673552.422373 | 0.2585007824198784    | 12.441619718309857 | 0.08683422334575583     | 0.007716441266265022        | 142                    | 16932673552.422373 | 0.2585007824198784    | 12.441619718309857 | 0.08683422334575583        | 0.007716441266265022
 triton_red_fused__native_batch_norm_legi | 39                  | 0                  | 0.13990024992108846   | 5.752589743589743  | 0                       | 0.004176126863316074        | 39                     | 0                  | 0.13990024992108846   | 5.752589743589743  | 0                          | 0.004176126863316074
 triton_poi_fused__native_batch_norm_legi | 25                  | 0                  | 0.31824055917536503   | 2.5291999999999994 | 0                       | 0.009499718184339253        | 25                     | 0                  | 0.31824055917536503   | 2.5291999999999994 | 0                          | 0.009499718184339253
 void cutlass::Kernel2<cutlass_80_tensoro | 98                  | 16211056473.596165 | 0.42972434051025826   | 7.130408163265306  | 0.08313362294151874     | 0.012827592254037562        | 98                     | 16211056473.596165 | 0.42972434051025826   | 7.130408163265306  | 0.08313362294151874        | 0.012827592254037562
 triton_red_fused__native_batch_norm_legi | 73                  | 0                  | 0.3225381327611705    | 9.987068493150682  | 0                       | 0.009628003963020014        | 73                     | 0                  | 0.3225381327611705    | 9.987068493150682  | 0                          | 0.009628003963020014
 triton_poi_fused__native_batch_norm_legi | 15                  | 0                  | 1.4491211346487216    | 4.439333333333333  | 0                       | 0.043257347302946926        | 15                     | 0                  | 1.4491211346487216    | 4.439333333333333  | 0                          | 0.043257347302946926
 void cutlass::Kernel2<cutlass_80_tensoro | 186                 | 14501701145.337954 | 0.2667131401910989    | 7.873865591397849  | 0.07436769818122027     | 0.007961586274361157        | 186                    | 14501701145.337954 | 0.2667131401910989    | 7.873865591397849  | 0.07436769818122027        | 0.007961586274361157
 triton_poi_fused__native_batch_norm_legi | 33                  | 0                  | 1.4924556538193923    | 4.3101515151515155 | 0                       | 0.044550915039384846        | 33                     | 0                  | 1.4924556538193923    | 4.3101515151515155 | 0                          | 0.044550915039384846
 triton_red_fused__native_batch_norm_legi | 29                  | 0                  | 0.25562590522631107   | 6.296275862068965  | 0                       | 0.007630624036606301        | 29                     | 0                  | 0.25562590522631107   | 6.296275862068965  | 0                          | 0.007630624036606301
 triton_poi_fused__native_batch_norm_legi | 13                  | 0                  | 0.5870562174192726    | 2.7397692307692307 | 0                       | 0.01752406619162008         | 13                     | 0                  | 0.5870562174192726    | 2.7397692307692307 | 0                          | 0.01752406619162008
 triton_poi_fused__native_batch_norm_legi | 34                  | 0                  | 0.41409928846284      | 2.853588235294117  | 0                       | 0.012361172789935523        | 34                     | 0                  | 0.41409928846284      | 2.853588235294117  | 0                          | 0.012361172789935523
 triton_per_fused__native_batch_norm_legi | 34                  | 0                  | 0.11705315007018151   | 3.460647058823529  | 0                       | 0.0034941238826919864       | 34                     | 0                  | 0.11705315007018151   | 3.460647058823529  | 0                          | 0.0034941238826919864
 triton_poi_fused__native_batch_norm_legi | 16                  | 0                  | 0.17207853197124584   | 2.3459375000000002 | 0                       | 0.005136672596156592        | 16                     | 0                  | 0.17207853197124584   | 2.3459375000000002 | 0                          | 0.005136672596156592
 triton_per_fused__native_batch_norm_legi | 30                  | 0                  | 0.2639714322022256    | 6.131199999999999  | 0                       | 0.007879744244842555        | 30                     | 0                  | 0.2639714322022256    | 6.131199999999999  | 0                          | 0.007879744244842555
 sm90_xmma_fprop_implicit_gemm_f32f32_tf3 | 100                 | 11875430356.891787 | 0.19494470869421385   | 16.36534           | 0.06089964285585531     | 0.005819245035648175        | 100                    | 11875430356.891787 | 0.19494470869421385   | 16.36534           | 0.06089964285585531        | 0.005819245035648175
 triton_poi_fused__native_batch_norm_legi | 8                   | 0                  | 0.9854096626224687    | 3.2757500000000004 | 0                       | 0.029415213809625928        | 8                      | 0                  | 0.9854096626224687    | 3.2757500000000004 | 0                          | 0.029415213809625928
 void cublasLt::splitKreduce_kernel<32, 1 | 56                  | 34377923395.147064 | 0.8310300045762317    | 3.4199999999999986 | 0.17629704305203628     | 0.024806865808245714        | 56                     | 34377923395.147064 | 0.8310300045762317    | 3.4199999999999986 | 0.17629704305203628        | 0.024806865808245714
 triton_poi_fused__native_batch_norm_legi | 23                  | 0                  | 0.9944002965861103    | 3.2431304347826084 | 0                       | 0.02968359094286896         | 23                     | 0                  | 0.9944002965861103    | 3.2431304347826084 | 0                          | 0.02968359094286896
 triton_per_fused__native_batch_norm_legi | 10                  | 0                  | 0.1826801058931057    | 4.428800000000001  | 0                       | 0.00545313748934644         | 10                     | 0                  | 0.1826801058931057    | 4.428800000000001  | 0                          | 0.00545313748934644
 triton_poi_fused__native_batch_norm_legi | 10                  | 0                  | 0.3168973585366449    | 2.5471999999999997 | 0                       | 0.009459622642884923        | 10                     | 0                  | 0.3168973585366449    | 2.5471999999999997 | 0                          | 0.009459622642884923
 triton_poi_fused__native_batch_norm_legi | 34                  | 0                  | 1.1463614897015777    | 4.124323529411764  | 0                       | 0.03421974596124114         | 34                     | 0                  | 1.1463614897015777    | 4.124323529411764  | 0                          | 0.03421974596124114
 void cask_plugin_cudnn::xmma_cudnn::init | 44                  | 44045510816.64277  | 2.0661232850348643    | 3.6887499999999993 | 0.22587441444432194     | 0.06167532194133924         | 44                     | 44045510816.64277  | 2.0661232850348643    | 3.6887499999999993 | 0.22587441444432194        | 0.06167532194133924
 sm90_xmma_fprop_implicit_gemm_f32f32_tf3 | 95                  | 7876855400.165316  | 0.4694941555946739    | 18.224315789473682 | 0.04039413025725802     | 0.014014750913273854        | 95                     | 7876855400.165316  | 0.4694941555946739    | 18.224315789473682 | 0.04039413025725802        | 0.014014750913273854
 triton_per_fused__native_batch_norm_legi | 41                  | 0                  | 0.06825669875995298   | 3.0384146341463416 | 0                       | 0.002037513395819492        | 41                     | 0                  | 0.06825669875995298   | 3.0384146341463416 | 0                          | 0.002037513395819492
 triton_poi_fused__native_batch_norm_legi | 23                  | 0                  | 0.08808154712430301   | 2.3275652173913044 | 0                       | 0.0026292999141582997       | 23                     | 0                  | 0.08808154712430301   | 2.3275652173913044 | 0                          | 0.0026292999141582997
 triton_per_fused__native_batch_norm_legi | 40                  | 0                  | 0.18179321034952417   | 4.556825           | 0                       | 0.005426662995508183        | 40                     | 0                  | 0.18179321034952417   | 4.556825           | 0                          | 0.005426662995508183
 triton_poi_fused__native_batch_norm_legi | 15                  | 0                  | 0.5887415155454232    | 2.783866666666667  | 0                       | 0.017574373598370836        | 15                     | 0                  | 0.5887415155454232    | 2.783866666666667  | 0                          | 0.017574373598370836
 void cutlass::Kernel2<cutlass_80_tensoro | 38                  | 14242013806.264643 | 0.256592404353939     | 7.217631578947369  | 0.0730359682372546      | 0.007659474756834           | 38                     | 14242013806.264643 | 0.256592404353939     | 7.217631578947369  | 0.0730359682372546         | 0.007659474756834
 triton_poi_fused__native_batch_norm_legi | 21                  | 0                  | 0.5842860973430516    | 2.7779047619047623 | 0                       | 0.017441376040091088        | 21                     | 0                  | 0.5842860973430516    | 2.7779047619047623 | 0                          | 0.017441376040091088
 triton_per_fused__native_batch_norm_legi | 16                  | 0                  | 0.11509365173486417   | 3.5959375000000002 | 0                       | 0.0034356313950705724       | 16                     | 0                  | 0.11509365173486417   | 3.5959375000000002 | 0                          | 0.0034356313950705724
 triton_poi_fused__native_batch_norm_legi | 14                  | 0                  | 0.1704672000243914    | 2.4044285714285714 | 0                       | 0.00508857313505646         | 14                     | 0                  | 0.1704672000243914    | 2.4044285714285714 | 0                          | 0.00508857313505646
 triton_poi_fused__native_batch_norm_legi | 58                  | 0                  | 2.307520779930795     | 8.190706896551722  | 0                       | 0.06888121731136704         | 58                     | 0                  | 2.307520779930795     | 8.190706896551722  | 0                          | 0.06888121731136704
 triton_per_fused__native_batch_norm_legi | 29                  | 0                  | 0.037243248971881276  | 3.0277586206896556 | 0                       | 0.001111738775280038        | 29                     | 0                  | 0.037243248971881276  | 3.0277586206896556 | 0                          | 0.001111738775280038
 triton_poi_fused__native_batch_norm_legi | 20                  | 0                  | 0.04741699795428918   | 2.2911500000000005 | 0                       | 0.0014154327747549007       | 20                     | 0                  | 0.04741699795428918   | 2.2911500000000005 | 0                          | 0.0014154327747549007
 triton_per_fused__native_batch_norm_legi | 25                  | 0                  | 0.13357016893727824   | 3.37536            | 0                       | 0.003987169222008305        | 25                     | 0                  | 0.13357016893727824   | 3.37536            | 0                          | 0.003987169222008305
 triton_poi_fused__native_batch_norm_legi | 13                  | 0                  | 0.3089862268300253    | 2.8111538461538457 | 0                       | 0.009223469457612694        | 13                     | 0                  | 0.3089862268300253    | 2.8111538461538457 | 0                          | 0.009223469457612694
 triton_poi_fused__native_batch_norm_legi | 17                  | 0                  | 0.3129385387909844    | 2.673              | 0                       | 0.009341448919133863        | 17                     | 0                  | 0.3129385387909844    | 2.673              | 0                          | 0.009341448919133863
 triton_per_fused__native_batch_norm_legi | 19                  | 0                  | 0.2215568162533158    | 3.8837368421052636 | 0                       | 0.0066136363060691275       | 19                     | 0                  | 0.2215568162533158    | 3.8837368421052636 | 0                          | 0.0066136363060691275
 std::enable_if<!(false), void>::type int | 23                  | 504916805.19297093 | 1.0118296096314707    | 8.113913043478261  | 0.0025893169497075447   | 0.030203868944223014        | 23                     | 504916805.19297093 | 1.0118296096314707    | 8.113913043478261  | 0.0025893169497075447      | 0.030203868944223014
 triton_poi_fused_add_copy__38            | 56                  | 0                  | 0                     | 2.132482142857143  | 0                       | 0                           | 56                     | 0                  | 0                     | 2.132482142857143  | 0                          | 0
 triton_poi_fused_convolution_0           | 18                  | 0                  | 0.43458610794936897   | 2.773333333333334  | 0                       | 0.012972719640279667        | 18                     | 0                  | 0.43458610794936897   | 2.773333333333334  | 0                          | 0.012972719640279667
 triton_poi_fused_convolution_1           | 17                  | 0                  | 0.028816312469162712  | 2.6145882352941174 | 0                       | 0.0008601884319153051       | 17                     | 0                  | 0.028816312469162712  | 2.6145882352941174 | 0                          | 0.0008601884319153051
 void convolve_common_engine_float_NHWC<f | 44                  | 8641868995.31118   | 0.024730540008465626  | 25.87327272727273  | 0.04431727689903169     | 0.0007382250748795709       | 44                     | 8641868995.31118   | 0.024730540008465626  | 25.87327272727273  | 0.04431727689903169        | 0.0007382250748795709
 triton_per_fused__native_batch_norm_legi | 12                  | 0                  | 0.6809930918986744    | 4.82675            | 0                       | 0.020328151996975356        | 12                     | 0                  | 0.6809930918986744    | 4.82675            | 0                          | 0.020328151996975356
 triton_per_fused__native_batch_norm_legi | 14                  | 0                  | 0.02883030597936608   | 2.6651428571428575 | 0                       | 0.0008606061486377935       | 14                     | 0                  | 0.02883030597936608   | 2.6651428571428575 | 0                          | 0.0008606061486377935
 triton_per_fused__native_batch_norm_legi | 16                  | 0                  | 0.0014658988233201874 | 2.098              | 0                       | 4.375817383045335e-05       | 16                     | 0                  | 0.0014658988233201874 | 2.098              | 0                          | 4.375817383045335e-05
 triton_poi_fused__native_batch_norm_legi | 13                  | 0                  | 0.9926297180284697    | 3.2367692307692306 | 0                       | 0.02963073785159611         | 13                     | 0                  | 0.9926297180284697    | 3.2367692307692306 | 0                          | 0.02963073785159611
 triton_poi_fused__native_batch_norm_legi | 9                   | 0                  | 1.3008817095666507    | 3.0863333333333336 | 0                       | 0.03883228983781048         | 9                      | 0                  | 1.3008817095666507    | 3.0863333333333336 | 0                          | 0.03883228983781048
 void at::native::(anonymous namespace):: | 98                  | 0                  | 0.09174335613709389   | 4.408520408163265  | 0                       | 0.0027386076458833994       | 98                     | 0                  | 0.09174335613709389   | 4.408520408163265  | 0                          | 0.0027386076458833994
 void at::native::vectorized_elementwise_ | 7                   | 0                  | 0                     | 1.7278571428571428 | 0                       | 0                           | 7                      | 0                  | 0                     | 1.7278571428571428 | 0                          | 0
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149697
Approved by: https://github.com/eellison, https://github.com/shunting314
2025-07-01 16:51:03 +00:00
0f9c1b374f [dynamo] Ensure global state guard is preserved across serialization. (#157285)
Currently, every time we construct a GLOBAL_STATE guard, we create a fresh guard based on the current global state. For precompile, we want to create a GLOBAL_STATE guard based on external sources instead, e.g. serialized global state. This also covers the normal case where we just pass in the global state guard from Python.

Differential Revision: [D77400988](https://our.internmc.facebook.com/intern/diff/D77400988/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157285
Approved by: https://github.com/jansel
2025-07-01 15:46:34 +00:00
b146e1a264 [BE] remove duplicates in generated torch._VF.__all__ (#157365)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157365
Approved by: https://github.com/Skylion007
2025-07-01 15:43:20 +00:00
c78fce9e79 [dynamo] show frame information when recompilation is triggered on fail_on_recompile (#156433)
Adds more information to the error message for debugging.

example error message:
```
Detected recompile when torch.compile stance is 'fail_on_recompile'. filename: 'caffe2/test/dynamo/test_misc.py', function name: 'fn', line number: 0
Failed on the following precompiled guards:

TREE_GUARD_MANAGER:
+- RootGuardManager
| +- LAMBDA_GUARD: isinstance(L['x'], bool)
GuardDebugInfo(
result=0,
verbose_code_parts=["isinstance(L['x'], bool)"],
num_guards_executed=1)
```
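
For context, a minimal sketch of how this stance gets hit (the function and shapes are made up to force a recompile):

```py
import torch

@torch.compile
def fn(x):
    return x * 2

fn(torch.randn(4))            # first compilation
with torch.compiler.set_stance("fail_on_recompile"):
    fn(torch.randn(4, 5))     # rank change forces a recompile -> raises with the frame info above
```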

Differential Revision: [D76987126](https://our.internmc.facebook.com/intern/diff/D76987126/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156433
Approved by: https://github.com/jamesjwu
2025-07-01 15:15:58 +00:00
023887fc5a Revert "Switch to standard pep517 sdist generation (#152098)"
This reverts commit f16053f0c9a09fa337fbf85aaf64f88712b8dcdb.

Reverted https://github.com/pytorch/pytorch/pull/152098 on behalf of https://github.com/malfet due to IMO this PR needs to be split into few helper ones, with better test plan ([comment](https://github.com/pytorch/pytorch/pull/152098#issuecomment-3024223880))
2025-07-01 14:14:52 +00:00
1586521461 Revert "Compute contiguity symbolically to avoid dde, and introduce c++ sym_is_contiguous (#155590)"
This reverts commit 2c76f31221e117b217b8a6a96a5405f626d2218a.

Reverted https://github.com/pytorch/pytorch/pull/155590 on behalf of https://github.com/jeanschmidt due to Breaking 1000s of internal builds, it cant be properly landed internally, there are no options except revert and codev. ([comment](https://github.com/pytorch/pytorch/pull/155590#issuecomment-3023503929))
2025-07-01 11:23:00 +00:00
812 changed files with 19751 additions and 6435 deletions

View File

@ -383,7 +383,7 @@ if [[ -n "${CI:-}" ]]; then
fi
# Build image
tar ch . | docker build \
docker build \
${no_cache_flag} \
${progress_flag} \
--build-arg "BUILD_ENVIRONMENT=${image}" \
@ -422,7 +422,7 @@ tar ch . | docker build \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
-
.
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could

View File

@ -1 +1 @@
c8757738a7418249896224430ce84888e8ecdd79
ae848267bebc65c6181e8cc5e64a6357d2679260

View File

@ -10,6 +10,8 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.9
function install_cuda {
version=$1
runfile=$2
@ -40,13 +42,52 @@ function install_cudnn {
rm -rf tmp_cudnn
}
function install_nvshmem {
cuda_major_version=$1 # e.g. "12"
nvshmem_version=$2 # e.g. "3.3.9"
case "${arch_path}" in
sbsa)
dl_arch="aarch64"
;;
x86_64)
dl_arch="x64"
;;
*)
dl_arch="${arch}"
;;
esac
tmpdir="tmp_nvshmem"
mkdir -p "${tmpdir}" && cd "${tmpdir}"
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/include/
cp -a "libnvshmem/lib/"* /usr/local/lib/
# cleanup
cd ..
rm -rf "${tmpdir}"
echo "nvSHMEM ${nvshmem_version} for CUDA ${cuda_major_version} (${arch_path}) installed."
}
function install_126 {
CUDNN_VERSION=9.10.2.21
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux
install_cudnn 12 $CUDNN_VERSION
install_nvshmem 12 $NVSHMEM_VERSION
CUDA_VERSION=12.6 bash install_nccl.sh
CUDA_VERSION=12.6 bash install_cusparselt.sh
@ -56,13 +97,15 @@ function install_126 {
function install_129 {
CUDNN_VERSION=9.10.2.21
echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.9.1 in the same container
install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 12 $CUDNN_VERSION
install_nvshmem 12 $NVSHMEM_VERSION
CUDA_VERSION=12.9 bash install_nccl.sh
CUDA_VERSION=12.9 bash install_cusparselt.sh
@ -106,13 +149,15 @@ function prune_126 {
function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.8.1 in the same container
install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 12 $CUDNN_VERSION
install_nvshmem 12 $NVSHMEM_VERSION
CUDA_VERSION=12.8 bash install_nccl.sh
CUDA_VERSION=12.8 bash install_cusparselt.sh

View File

@ -383,6 +383,6 @@ cmake==4.0.0
tlparse==0.3.30
#Description: required for log parsing
cuda-bindings>=12.0,<13.0
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
#test that import: test_cuda.py

View File

@ -1 +0,0 @@
../../docs/requirements.txt

View File

@ -0,0 +1,61 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought is probably
# something related to Docker setup. We can investigate this later
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.8.6
sphinxext-opengraph==0.9.1
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.1
sphinx_sitemap==2.6.0
#Description: This is used to generate sitemap for PyTorch docs
#Pinned versions: 2.6.0
matplotlib==3.5.3 ; python_version < "3.13"
matplotlib==3.6.3 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 3.6.3 if python > 3.12. Otherwise 3.5.3.
tensorboard==2.13.0 ; python_version < "3.13"
tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.34.0
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.2.3
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.16
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.0.1
IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 0.13.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1

View File

@ -1 +1 @@
3.3.1
3.4.0

View File

@ -51,20 +51,22 @@ else
fi
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in
12.8|12.9)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
#removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
12.8)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0"
;;
12.9)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
# WAR to resolve the ld error in libtorch build with CUDA 12.9
if [[ "$DESIRED_CUDA" == "cu129" && "$PACKAGE_TYPE" == "libtorch" ]]; then
if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
fi
;;
12.6)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
;;
*)
echo "unknown cuda version $CUDA_VERSION"

View File

@ -393,10 +393,8 @@ else
# This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
# is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
# 16 CPUs
if [ -z "$MAX_JOBS_OVERRIDE" ]; then
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS
fi
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS
# NB: Install outside of source directory (at the same level as the root
# pytorch folder) so that it doesn't get cleaned away prior to docker push.

View File

@ -13,6 +13,13 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
fi
if which sccache > /dev/null; then
# Clear SCCACHE_BUCKET and SCCACHE_REGION if they are empty, otherwise
# sccache will complain about invalid bucket configuration
if [[ -z "${SCCACHE_BUCKET:-}" ]]; then
unset SCCACHE_BUCKET
unset SCCACHE_REGION
fi
# Save sccache logs to file
sccache --stop-server > /dev/null 2>&1 || true
rm -f ~/sccache_error.log || true

View File

@ -11,6 +11,8 @@ export TERM=vt100
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
@ -163,8 +165,6 @@ elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
# setting PYTHON_TEST_EXTRA_OPTION
export PYTHON_TEST_EXTRA_OPTION="--xpu"
# Disable sccache for xpu test due to flaky issue https://github.com/pytorch/pytorch/issues/143585
sudo rm -rf /opt/cache
fi
if [[ "$TEST_CONFIG" == *crossref* ]]; then
@ -333,9 +333,9 @@ test_h100_distributed() {
test_h100_symm_mem() {
# symmetric memory test
time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time TORCH_SYMMMEM=NCCL python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
@ -368,6 +368,16 @@ test_dynamo_wrapped_shard() {
assert_git_not_dirty
}
test_einops() {
pip install einops==0.6.1
time python test/run_test.py --einops --verbose --upload-artifacts-while-running
pip install einops==0.7.0
time python test/run_test.py --einops --verbose --upload-artifacts-while-running
pip install einops==0.8.1
time python test/run_test.py --einops --verbose --upload-artifacts-while-running
assert_git_not_dirty
}
test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
@ -426,14 +436,21 @@ test_inductor_aoti() {
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH
CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
BUILD_COMMAND=(python setup.py develop)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi
# aoti cmake custom command requires `torch` to be installed
# initialize the cmake build cache and install torch
/usr/bin/env "${BUILD_COMMAND[@]}"
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
}
test_inductor_cpp_wrapper_shard() {
@ -446,47 +463,26 @@ test_inductor_cpp_wrapper_shard() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
if [[ "$1" -eq "2" ]]; then
# For now, manually put the opinfo tests in shard 2, and all other tests in
# shard 1. Run all CPU tests, as well as specific GPU tests triggering past
# bugs, for now.
python test/run_test.py \
--include inductor/test_torchinductor_opinfo \
-k 'linalg or to_sparse or TestInductorOpInfoCPU' \
--verbose
exit
fi
# Run certain inductor unit tests with cpp wrapper. In the end state, we
# should be able to run all the inductor unit tests with cpp_wrapper.
#
# TODO: I'm pretty sure that "TestInductorOpInfoCPU" is not a valid filter,
# but change that in another PR to more accurately monitor the increased CI
# usage.
python test/run_test.py \
--include inductor/test_torchinductor_opinfo \
-k 'linalg or to_sparse or TestInductorOpInfoCPU' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py \
--include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py --inductor \
--include test_torch \
-k 'take' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py --inductor --include test_torch -k 'take' --verbose
# Run inductor benchmark tests with cpp wrapper.
# Skip benchmark tests if it's in rerun-disabled-mode.
if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then
echo "skip dynamo benchmark tests for rerun-disabled-test"
else
echo "run dynamo benchmark tests with cpp wrapper"
python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_timm_training.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_torchbench_inference.csv"
fi
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -1698,11 +1694,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchaudio cuda
install_torchvision
checkout_install_torchbench hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
test_inductor_aoti
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"
@ -1711,6 +1707,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
test_inductor_distributed
fi
fi
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
install_torchvision
test_dynamo_wrapped_shard "${SHARD_NUMBER}"
@ -1760,7 +1758,7 @@ elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == test_h100_symm_mem ]]; then
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
test_h100_symm_mem
else
install_torchvision

View File

@ -75,8 +75,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
# CUDA 12.8 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == cu128 ]]; then
# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi

View File

@ -125,7 +125,7 @@ runs:
TAG: ${{ steps.parse-ref.outputs.tag }}
EVENT_NAME: ${{ github.event_name }}
SCHEDULE: ${{ github.event.schedule }}
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
HEAD_BRANCH: ${{ steps.parse-ref.outputs.branch }}
id: filter
run: |
echo "Workflow: ${GITHUB_WORKFLOW}"

View File

@ -304,8 +304,7 @@ def unzip_artifact_and_replace_files() -> None:
def set_output() -> None:
# Disable for now so we can monitor first
# pass
print("Setting output reuse=true")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print("reuse=true", file=env)

View File

@ -1 +1 @@
4e94321c54617dd738a05bfedfc28bc0fa635b5c
70caf76066ef2c1054d6128b11769dc816a779e7

View File

@ -1 +1 @@
926700d7832caa552ba2e1fc8302f6a2f4d2f6d8
1c00dea2c9adb2137903c86b4191e8c247f8fda9

View File

@ -275,7 +275,7 @@ def delete_branches() -> None:
delete_branch(git_repo, branch)
def delete_old_ciflow_tags() -> None:
def delete_old_tags() -> None:
# Deletes ciflow tags if they are associated with a closed PR or a specific
# commit. Lightweight tags don't have information about the date they were
# created, so we can't check how old they are. The script just assumes that
@ -288,23 +288,29 @@ def delete_old_ciflow_tags() -> None:
delete_branch(git_repo, f"refs/tags/{tag}")
tags = git_repo._run_git("tag").splitlines()
open_pr_numbers = [x["number"] for x in get_open_prs()]
CIFLOW_TAG_REGEX = re.compile(r"^ciflow\/.*\/(\d{5,6}|[0-9a-f]{40})$")
AUTO_REVERT_TAG_REGEX = re.compile(r"^trunk\/[0-9a-f]{40}$")
for tag in tags:
try:
if ESTIMATED_TOKENS[0] > 400:
print("Estimated tokens exceeded, exiting")
break
if not tag.startswith("ciflow/"):
if not CIFLOW_TAG_REGEX.match(tag) and not AUTO_REVERT_TAG_REGEX.match(tag):
continue
re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
if re_match_pr:
pr_number = int(re_match_pr.group(1))
if pr_number in open_pr_numbers:
continue
delete_tag(tag)
elif re_match_sha:
# This checks the date of the commit associated with the tag instead
# of the tag itself since lightweight tags don't have this
# information. I think it should be ok since this only runs once a
# day
tag_info = git_repo._run_git("show", "-s", "--format=%ct", tag)
tag_timestamp = int(tag_info.strip())
# Maybe some timezone issues, but a few hours shouldn't matter
tag_age_days = (datetime.now().timestamp() - tag_timestamp) / SEC_IN_DAY
if tag_age_days > 7:
print(f"[{tag}] Tag is older than 7 days, deleting")
delete_tag(tag)
except Exception as e:
print(f"Failed to check tag {tag}: {e}")
@ -312,4 +318,4 @@ def delete_old_ciflow_tags() -> None:
if __name__ == "__main__":
delete_branches()
delete_old_ciflow_tags()
delete_old_tags()
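
A quick sketch of which tag names the two new regexes treat as deletion candidates; the sample tags are hypothetical:

import re

CIFLOW_TAG_REGEX = re.compile(r"^ciflow\/.*\/(\d{5,6}|[0-9a-f]{40})$")
AUTO_REVERT_TAG_REGEX = re.compile(r"^trunk\/[0-9a-f]{40}$")

for tag in [
    "ciflow/trunk/157706",       # PR-number ciflow tag -> candidate
    "ciflow/trunk/" + "a" * 40,  # commit-sha ciflow tag -> candidate
    "trunk/" + "a" * 40,         # auto-revert trunk tag -> candidate
    "ciflow/doesntseemtomatch",  # malformed -> skipped
]:
    matched = CIFLOW_TAG_REGEX.match(tag) or AUTO_REVERT_TAG_REGEX.match(tag)
    print(tag, "->", "candidate" if matched else "skipped")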

View File

@ -18,6 +18,7 @@ import yaml
REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
MAIN_BRANCH = "main"
PREFIX = "test-config/"
@ -97,7 +98,7 @@ def parse_args() -> Any:
parser.add_argument(
"--branch",
type=str,
default="main",
default=MAIN_BRANCH,
help="the branch name",
)
return parser.parse_args()
@ -456,6 +457,7 @@ def download_json(url: str, headers: dict[str, str], num_retries: int = 3) -> An
def set_output(name: str, val: Any) -> None:
print(f"Setting output {name}={val}")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)
@ -495,13 +497,20 @@ def check_for_setting(labels: set[str], body: str, setting: str) -> bool:
def perform_misc_tasks(
labels: set[str], test_matrix: dict[str, list[Any]], job_name: str, pr_body: str
labels: set[str],
test_matrix: dict[str, list[Any]],
job_name: str,
pr_body: str,
branch: Optional[str] = None,
) -> None:
"""
In addition to applying the filter logic, the script also does the following
misc tasks to set the keep-going and is-unstable variables
"""
set_output("keep-going", check_for_setting(labels, pr_body, "keep-going"))
set_output(
"keep-going",
branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"),
)
set_output(
"ci-verbose-test-logs",
check_for_setting(labels, pr_body, "ci-verbose-test-logs"),
@ -624,6 +633,7 @@ def main() -> None:
test_matrix=filtered_test_matrix,
job_name=args.job_name,
pr_body=pr_body if pr_body else "",
branch=args.branch,
)
# Set the filtered test matrix as the output
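
The net effect of threading branch through perform_misc_tasks is that trunk runs always set keep-going, while PRs still opt in via label or body marker. A simplified sketch of just that decision (check_for_setting is reduced to label/substring checks here):

MAIN_BRANCH = "main"

def keep_going(branch: str, labels: set[str], pr_body: str) -> bool:
    # main always keeps going; PRs opt in with the "keep-going" label
    # or a "keep-going" marker in the PR body
    return branch == MAIN_BRANCH or "keep-going" in labels or "keep-going" in pr_body

assert keep_going("main", set(), "")
assert not keep_going("gh/user/feature", set(), "")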

View File

@ -17,7 +17,7 @@ from typing import Optional
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
CUDA_ARCHES = ["12.6", "12.8", "12.9"]
CUDA_STABLE = "12.6"
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",
@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -88,6 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"

View File

@ -136,10 +136,10 @@ def find_job_id_name(args: Any) -> tuple[str, str]:
def set_output(name: str, val: Any) -> None:
print(f"Setting output {name}={val}")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)
print(f"setting {name}={val}")
else:
print(f"::set-output name={name}::{val}")

View File

@ -5,6 +5,7 @@ import re
def set_output(name: str, val: str) -> None:
print(f"Setting output {name}={val}")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)

View File

@ -0,0 +1,56 @@
import os
import unittest
from datetime import datetime
from unittest.mock import MagicMock, patch
os.environ["GITHUB_TOKEN"] = "test_token"
from delete_old_branches import delete_old_tags
@patch("delete_old_branches.delete_branch")
@patch("gitutils.GitRepo._run_git")
class TestDeleteTag(unittest.TestCase):
def test_delete_tag(
self, mock_run_git: "MagicMock", mock_delete_tag: "MagicMock"
) -> None:
for tag in [
"ciflow/branch/12345",
"ciflow/commitsha/1234567890abcdef1234567890abcdef12345678",
"trunk/1234567890abcdef1234567890abcdef12345678",
]:
mock_run_git.side_effect = [
tag,
str(int(datetime.now().timestamp() - 8 * 24 * 60 * 60)), # 8 days ago
]
delete_old_tags()
mock_delete_tag.assert_called_once()
mock_delete_tag.reset_mock()
# Don't delete if the tag is not old enough
mock_run_git.side_effect = [
tag,
str(int(datetime.now().timestamp() - 6 * 24 * 60 * 60)), # 6 days ago
]
delete_old_tags()
mock_delete_tag.assert_not_called()
def test_do_not_delete_tag(
self, mock_run_git: "MagicMock", mock_delete_tag: "MagicMock"
) -> None:
for tag in [
"ciflow/doesntseemtomatch",
"trunk/doesntseemtomatch",
"doesntseemtomatch",
]:
mock_run_git.side_effect = [
tag,
str(int(datetime.now().timestamp() - 8 * 24 * 60 * 60)), # 8 days ago
]
delete_old_tags()
mock_delete_tag.assert_not_called()
if __name__ == "__main__":
unittest.main()

View File

@ -69,11 +69,6 @@ on:
required: false
type: string
default: ""
max-jobs:
description: |
Overwrite the number of jobs to use for the build
required: false
type: string
disable-monitor:
description: |
Disable utilization monitoring for build job
@ -266,7 +261,6 @@ jobs:
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
run: |
START_TIME=$(date +%s)
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@ -286,12 +280,6 @@ jobs:
DOCKER_SHELL_CMD=
fi
if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
MAX_JOBS="$(nproc --ignore=2)"
else
MAX_JOBS="${MAX_JOBS_OVERRIDE}"
fi
# Leaving 1GB for the runner and other things
TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
@ -303,8 +291,7 @@ jobs:
# shellcheck disable=SC2086
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS=${MAX_JOBS} \
-e MAX_JOBS_OVERRIDE \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
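
With the max-jobs input removed, the build always uses nproc --ignore=2. Roughly the same computation in Python (an approximation: nproc also honors CPU affinity masks, which os.cpu_count() reports differently on some platforms):

import os

# All online CPUs minus two, leaving headroom for the runner agent and
# the Docker daemon; never drop below one job.
max_jobs = max(1, (os.cpu_count() or 1) - 2)
print(max_jobs)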

View File

@ -90,10 +90,13 @@ jobs:
environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
permissions:
id-token: write
contents: read
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@ -105,18 +108,31 @@ jobs:
with:
no-sudo: true
- name: Setup Python
if: matrix.runner == 'B200'
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
cache: pip
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
- name: configure aws credentials
if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-test
aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
@ -148,17 +164,17 @@ jobs:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
@ -225,6 +241,12 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
@ -253,8 +275,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@ -264,7 +286,6 @@ jobs:
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
run: |
set -x
@ -290,10 +311,6 @@ jobs:
# if for some reason cleanup action doesn't stop container
# when job is cancelled
DOCKER_SHELL_CMD="sleep 12h"
# since some steps are skipped on s390x, if they are necessary, run them here
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
else
SHM_OPTS="--shm-size=${SHM_SIZE}"
JENKINS_USER="--user jenkins"
@ -345,7 +362,6 @@ jobs:
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e DASHBOARD_TAG \
-e IS_A100_RUNNER \
-e ARTIFACTS_FILE_SUFFIX \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
@ -384,6 +400,15 @@ jobs:
test_config: ${{ matrix.config }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ matrix.runner == 'B200' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
# The max duration enforced by the server side
role-duration-seconds: 18000
aws-region: us-east-1
- name: Upload the benchmark results
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
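
The ${{ cond && 'value' || '' }} expressions used above for SCCACHE_BUCKET and SCCACHE_REGION are the GitHub Actions ternary idiom. Python's and/or chaining behaves the same way here, since the middle operand is a truthy string:

runner = "B200"

# '' when the runner is B200, the bucket name otherwise; this and/or trick
# only works because the middle operand is truthy
sccache_bucket = runner != "B200" and "ossci-compiler-cache-circleci-v2" or ""
assert sccache_bucket == ""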

View File

@ -123,7 +123,7 @@ jobs:
else
# The runner has access to the S3 bucket via IAM profile without the need
# for any credential
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"0
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
fi

View File

@ -191,9 +191,6 @@ jobs:
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}

View File

@ -52,6 +52,7 @@ jobs:
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
rocm_version: "6.4"
@ -67,6 +68,7 @@ jobs:
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge"
timeout-minutes: 40
env:
DOCKER_IMAGE: ${{ matrix.device == 'rocm' && format('pytorch/manylinux2_28-builder:rocm{0}', matrix.rocm_version) || matrix.device == 'aarch64' && 'pytorch/manylinux2_28_aarch64-builder:cpu-aarch64' || matrix.docker-image }}
PY_VERS: ${{ matrix.py_vers }}
BUILD_DEVICE: ${{ matrix.device }}
PLATFORM: 'manylinux_2_28_x86_64'
@ -84,34 +86,14 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: ${{ matrix.device == 'aarch64' && 'manylinux2_28_aarch64-builder' || 'manylinux2_28-builder' }}
# NOTE: CUDA builds are currently built using the cpu tag
custom-tag-prefix: ${{ matrix.device == 'rocm' && format('rocm{0}', matrix.rocm_version) || matrix.device == 'aarch64' && 'cpu-aarch64' || 'cpu' }}
docker-build-dir: .ci/docker
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
docker-image: ${{ env.DOCKER_IMAGE }}
- name: Build Triton wheel
env:
IS_RELEASE_TAG: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
run: |
set -x
mkdir -p "${RUNNER_TEMP}/artifacts/"

View File

@ -35,7 +35,6 @@ jobs:
contents: write
outputs:
pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
@ -54,57 +53,36 @@ jobs:
tag_or_branch="${tag_or_branch#refs/heads/}"
# replace directory separators with _ in branch name
tag_or_branch="${tag_or_branch//\//_}"
torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
{
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
} >> "$GITHUB_ENV"
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
- name: Create PEP 517 compatible source distribution
run: |
pip install build==1.2.2.post1 || exit 1
python -m build --sdist || exit 1
cd dist || exit 1
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
- name: Upload source distribution for release
if: ${{ github.event_name == 'release' }}
uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
with:
files: |
${{ env.PT_RELEASE_FILE }}
${{ env.PT_PEP517_RELEASE_FILE }}
files: ${{env.PT_RELEASE_FILE}}
- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
- name: Upload PEP 517 source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_PEP517_RELEASE_FILE }}
path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
- name: Set output
id: release_name
run: |
{
echo "name=pt_release_name::${{ env.PT_RELEASE_FILE }}";
echo "name=pt_pep517_release_name::${{ env.PT_PEP517_RELEASE_FILE }}";
} >> "${GITHUB_OUTPUT}"
run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
@ -120,9 +98,6 @@ jobs:
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_release_name }}
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_pep517_release_name }}
- name: Configure AWS credentials(PyTorch account)
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
@ -133,9 +108,7 @@ jobs:
s3-bucket: pytorch
s3-prefix: source_code/test
if-no-files-found: warn
path: |
${{ needs.release.outputs.pt_release_name }}
${{ needs.release.outputs.pt_pep517_release_name }}
path: ${{ needs.release.outputs.pt_release_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}

View File

@ -136,7 +136,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -252,7 +252,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -368,7 +368,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -484,7 +484,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -600,7 +600,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -716,7 +716,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -61,7 +61,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@ -108,7 +108,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@ -155,7 +155,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing

View File

@ -131,7 +131,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@ -200,7 +200,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@ -269,7 +269,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
@@ -744,7 +744,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -813,7 +813,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -882,7 +882,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@@ -1357,7 +1357,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@@ -1407,74 +1407,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_6-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_6-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_6-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # for other CUDA versions, we use the 4xlarge runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_6-full-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_6-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
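The removed `-full` jobs above follow the same build → test → upload chain as every other binary job: each stage calls a reusable workflow via `uses:` and is wired to the previous one with `needs:`. A minimal sketch of the callee side, assuming (not verified here) that a file such as `_binary-upload.yml` declares its interface roughly like this:

```yaml
# Assumed shape of a callee like _binary-upload.yml (illustration, not the
# actual file): a reusable workflow declares its inputs and secrets under the
# workflow_call trigger, and callers must match them via `with:`/`secrets:`.
on:
  workflow_call:
    inputs:
      build_name:
        required: true
        type: string
    secrets:
      github-token:
        required: true
jobs:
  upload:
    runs-on: ubuntu-latest
    steps:
      - run: echo "uploading artifacts for ${{ inputs.build_name }}"
```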
manywheel-py3_11-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1494,7 +1426,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@@ -1544,6 +1476,74 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_8-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_8-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 builds need an sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_8-full-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1563,7 +1563,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@@ -2038,7 +2038,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@@ -2107,7 +2107,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@@ -2176,7 +2176,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@@ -2651,7 +2651,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@@ -2720,7 +2720,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@@ -2789,7 +2789,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@@ -3264,7 +3264,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@@ -3333,7 +3333,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@@ -3402,7 +3402,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing

View File

@@ -38,7 +38,7 @@ jobs:
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "h100_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
{ config: "h100-symm-mem", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
]}
secrets: inherit
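Note the two secret-passing styles in this compare: the binary workflows forward a single `github-token` explicitly, while CI workflows like this one use `secrets: inherit`. A minimal sketch of the difference (the second callee path is a placeholder):

```yaml
jobs:
  explicit:
    uses: ./.github/workflows/_binary-test-linux.yml
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}   # callee sees only this secret
  inherited:
    uses: ./.github/workflows/_example-test.yml   # placeholder callee
    secrets: inherit                              # callee sees all caller secrets
```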

View File

@@ -13,7 +13,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
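The change repeated across the workflow files below is the same: the blanket `permissions: read-all` grant is replaced by an explicit map that adds `id-token: write`, the scope a job needs to mint an OIDC token (typically for keyless cloud authentication); any scope not listed defaults to none. A hedged sketch of a consumer (the action and role shown are placeholders, not taken from these files):

```yaml
permissions:
  id-token: write   # allow the job to request an OIDC token
  contents: read    # read-only checkout; unlisted scopes default to none
jobs:
  oidc-example:
    runs-on: ubuntu-latest
    steps:
      # Placeholder consumer: OIDC-based cloud auth needs id-token: write.
      - uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::123456789012:role/ci-example  # placeholder
          aws-region: us-east-1
```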

View File

@@ -13,7 +13,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:

View File

@@ -16,7 +16,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
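The `concurrency.group` expressions in these hunks lean on GitHub's expression semantics: `a || b` evaluates to `a` when it is truthy and to `b` otherwise, and `a && b` evaluates to `b` when `a` is truthy. So PR runs group by PR number, pushes fall back to the ref name, and the SHA is appended only for branch refs:

```yaml
concurrency:
  # pull_request events: the group key uses the PR number.
  # push events: pull_request.number is empty, so ref_name is used instead,
  # and `ref_type == 'branch' && sha` appends the SHA only on branch refs.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}
  cancel-in-progress: true   # newer runs in the same group cancel older ones
```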

View File

@@ -10,7 +10,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:

View File

@@ -48,7 +48,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-label-type:

View File

@@ -63,7 +63,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-label-type:

View File

@@ -5,7 +5,7 @@ on:
tags:
- ciflow/inductor-perf-test-nightly-rocm/*
schedule:
- cron: 0 7 * * 0
- cron: 0 7 * * 0,3
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
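For reference, GitHub's `schedule.cron` uses standard five-field cron in UTC, so the change above moves this workflow from weekly to twice weekly:

```yaml
# field order: minute hour day-of-month month day-of-week (0 = Sunday)
schedule:
  - cron: 0 7 * * 0,3   # 07:00 UTC on Sunday (0) and Wednesday (3); was Sunday only
```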
@@ -88,18 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit
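The reshard above (huggingface 3 → 4 shards, torchbench 4 → 8) only changes how the suite is partitioned: every entry names the same config, and under the usual shard/num_shards convention (assumed here; the consuming test workflow is not shown) the job for shard N runs the N-th of num_shards slices, so doubling num_shards roughly halves per-job runtime at the cost of more runners. Sketch:

```yaml
# Sketch of the assumed convention: same config on every entry, one slice per job.
test-matrix: |
  { include: [
    { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
    { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
  ]}
```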

View File

@@ -53,7 +53,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-label-type:

@ -58,7 +58,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-label-type:

@ -63,7 +63,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-label-type:

@ -15,7 +15,9 @@ concurrency:
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:

@ -12,7 +12,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-unittest
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
get-label-type:

@ -22,7 +22,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
unit-test:

@ -19,7 +19,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:

@ -20,7 +20,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
llm-td:

@ -19,7 +19,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
llm-td:
@ -201,6 +203,7 @@ jobs:
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
]}
secrets: inherit
@ -236,6 +239,7 @@ jobs:
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
]}
secrets: inherit

@ -15,7 +15,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
llm-td:

@ -18,7 +18,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
llm-td:

@ -16,7 +16,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
llm-td:

@ -122,6 +122,7 @@ is_formatter = true
[[linter]]
code = 'MYPY'
include_patterns = [
'setup.py',
'torch/**/*.py',
'torch/**/*.pyi',
'caffe2/**/*.py',
@ -1156,6 +1157,7 @@ exclude_patterns = [
'torch/_vendor/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'torch/utils/model_dump/preact.mjs',
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
@ -1167,17 +1169,10 @@ exclude_patterns = [
'test/**',
'test/test_*',
'test/[a-hA-h]*/**',
'test/inductor/**',
'test/dynamo/**',
'test/distributed/**',
'torch/**',
'torch/_*/**',
'torch/fx/**',
'torch/distributed/tensor/**',
'torch/[j-o]*/**',
'torch/utils/**',
'torch/csrc/jit/**',
'torch/csrc/jit/[a-o]*/**',
]
init_command = [
'python3',

@ -1,91 +1,50 @@
# Include individual top-level files
include MANIFEST.in
include BUCK.oss
include BUILD.bazel
include CITATION.cff
include CODEOWNERS
include Dockerfile
include LICENSE
include Makefile
include NOTICE
include WORKSPACE
include .bazelignore .bazelrc .bazelversion
include .clang-format .clang-tidy
include .cmakelintrc
include .coveragerc
include .dockerignore
include .flake8
include .gdbinit
include .lintrunner.toml
include .lldbinit
include docker.Makefile
include ubsan.supp
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html
# Include bazel related files
include *.bzl
# Include general configuration files
include *.ini
# Include important top-level information
include *.md
# Include technical text files
include *.txt
# Include ctags configuration
include .ctags.d/*.ctags
# Include subfolders completely
graft .devcontainer
graft .vscode
# Include source files in SDist
include CMakeLists.txt
include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
include BUCK BUCK.*
include requirements*.txt
include version.txt
include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
graft android
# The following folder (assets) is empty except for a .gitignore file, which
# will not be included in the sdist, hence we include the directory explicitly.
include android/test_app/app/src/main/assets
graft aten
graft binaries
graft c10
graft caffe2
graft cmake
graft torch
graft tools
graft test
graft docs
graft ios
graft third_party
graft test
graft benchmarks
graft scripts
graft mypy_plugins
graft modules
graft functorch
graft third_party
graft tools
graft torch
graft torchgen
# FIXME: torch-xla build during codegen will fail if include this file in wheel
exclude torchgen/BUILD.bazel
# The following exclusions omit parts from third-party dependencies that
# contain invalid symlinks[1] and that are not needed for pytorch, such as
# bindings for unused languages
prune third_party/ittapi/rust
prune third_party/flatbuffers/java
prune third_party/flatbuffers/kotlin
prune third_party/nccl/pkg/debian
prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-*
# Misc files and directories in SDist
include *.md
include CITATION.cff
include LICENSE NOTICE
include mypy*.ini
graft benchmarks
graft docs
graft mypy_plugins
graft scripts
# The following document is also an invalid symlink[1] and superfluous
exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md
# Misc files needed for custom setuptools command
include .gitignore
include .gitmodules
# Omit autogenerated code
# Include test suites in SDist
graft test
include pytest.ini
include .coveragerc
# Prune generated/compiled files
prune torchgen/packaged
# Omit caches, compiled, and scm related content
prune */__pycache__
prune **/.github
prune **/.gitlab
global-exclude *.o *.so *.dylib *.a
global-exclude *.pyc *.swp
global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
global-exclude .gitlab-ci.yml
global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]
# [1] For the purposes of Python source distributions, invalid symlinks are,
# according to the source distribution format[2], links pointing outside the
# destination directory or links with a `..` component; those are the ones of
# concern here.
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features
prune */.git
global-exclude .git *~ *.swp

@ -57,7 +57,8 @@ setup-env-cuda:
setup-env-rocm:
$(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --rocm"
.lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml
.PHONY: setup-lint
setup-lint .lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml
@echo "Setting up lintrunner..."
$(PIP) install lintrunner
lintrunner init
@ -65,9 +66,6 @@ setup-env-rocm:
@mkdir -p .lintbin
@sha256sum requirements.txt pyproject.toml .lintrunner.toml > .lintbin/.lintrunner.sha256
.PHONY: setup-lint
setup-lint: .lintbin/.lintrunner.sha256
.PHONY: lazy-setup-lint
lazy-setup-lint: .lintbin/.lintrunner.sha256
@if [ ! -x "$(shell command -v lintrunner)" ]; then \

@ -200,7 +200,7 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
- [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA
Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware
Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver, and NVIDIA hardware.
If you want to disable CUDA support, export the environment variable `USE_CUDA=0`.
Other potentially useful environment variables may be found in `setup.py`. If

@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:
| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- | --- |
| 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 |
| 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 |
| 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
@ -73,9 +74,9 @@ Following is the release cadence. All future dates below are tentative. For late
| 2.4 | Jun 2024 | Jul 2024 | Sept 2024 | Not planned |
| 2.5 | Sep 2024 | Oct 2024 | Nov 2024 | Not planned |
| 2.6 | Dec 2024 | Jan 2025 | Not planned | Not planned |
| 2.7 | Mar 2025 | Apr 2025 | (May 2025) | (Jun 2025) |
| 2.7 | Mar 2025 | Apr 2025 | Jun 2025 | Not planned |
| 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sep 2025) |
| 2.9 | Aug 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) |
| 2.9 | Sept 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) |
| 2.10 | Dec 2025 | Jan 2026 | (Feb 2026) | (Mar 2026) |
| 2.11 | Mar 2026 | Apr 2026 | (Jun 2026) | (Jul 2026) |

@ -30,7 +30,7 @@ TORCH_API bool isAccelerator(c10::DeviceType device_type);
template <
typename... T,
typename = std::enable_if_t<(std::is_same_v<T, c10::DeviceType> && ...)>>
TORCH_API inline bool isAcceleratorExcluded(
inline bool isAcceleratorExcluded(
c10::DeviceType device_type,
c10::DeviceType first_excluded,
T... rest_excluded) {

@ -178,7 +178,7 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
view_metas_.push_back(meta);
// Manually track the fact that this tensor recieved a metadata mutation!
// Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta);
@ -579,7 +579,7 @@ std::vector<Tensor> from_functional_tensor(ITensorListRef t_list) {
for (const auto& tensor : t_list) {
// from_functional_tensor(Tensor) has asserts to make sure you don't accidentally call
// it on a non-functional input,
// but from_functional_tensor(TensorList) can recieve a list containing both
// but from_functional_tensor(TensorList) can receive a list containing both
// functional and non-functional tensors.
// Example of when that can happen: torch.cat(function_input_tensor, global_state_tensor).
// When that happens, we're okay with only unwrapping the functional tensors.

@ -300,7 +300,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
namespace functionalization {
namespace impl {
TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
const Tensor& tensor) {
auto functional_impl =
static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());

@ -167,14 +167,14 @@ TORCH_API TensorImpl* propagate_names(
TORCH_API void propagate_names(TensorImpl* result, /*const */ TensorImpl* src);
TORCH_API inline void propagate_names(
inline void propagate_names(
const TensorBase& result,
DimnameList names,
bool validate_names = false) {
propagate_names(result.unsafeGetTensorImpl(), names, validate_names);
}
TORCH_API inline void propagate_names_if_nonempty(
inline void propagate_names_if_nonempty(
const TensorBase& result,
DimnameList names,
bool validate_names = false) {
@ -182,9 +182,7 @@ TORCH_API inline void propagate_names_if_nonempty(
result.unsafeGetTensorImpl(), names, validate_names);
}
TORCH_API inline void propagate_names(
const TensorBase& result,
const TensorBase& src) {
inline void propagate_names(const TensorBase& result, const TensorBase& src) {
propagate_names(result.unsafeGetTensorImpl(), src.unsafeGetTensorImpl());
}

@ -25,7 +25,7 @@ TORCH_API void set_autocast_cache_enabled(bool enabled);
// deprecated CUDA-specific autocast APIs
C10_DEPRECATED_MESSAGE(
"at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.")
TORCH_API inline bool is_enabled() {
inline bool is_enabled() {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@ -34,7 +34,7 @@ TORCH_API inline bool is_enabled() {
}
C10_DEPRECATED_MESSAGE(
"at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.")
TORCH_API inline void set_enabled(bool enabled) {
inline void set_enabled(bool enabled) {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@ -43,7 +43,7 @@ TORCH_API inline void set_enabled(bool enabled) {
}
C10_DEPRECATED_MESSAGE(
"at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.")
TORCH_API inline at::ScalarType get_autocast_gpu_dtype() {
inline at::ScalarType get_autocast_gpu_dtype() {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@ -52,7 +52,7 @@ TORCH_API inline at::ScalarType get_autocast_gpu_dtype() {
}
C10_DEPRECATED_MESSAGE(
"at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.")
TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@ -65,7 +65,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::is_" #name \
"_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \
") instead.") \
TORCH_API inline bool is_##name##_enabled() { \
inline bool is_##name##_enabled() { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \
@ -78,7 +78,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::set_" #name \
"_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \
", enabled) instead.") \
TORCH_API inline void set_##name##_enabled(bool enabled) { \
inline void set_##name##_enabled(bool enabled) { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \
@ -91,7 +91,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::get_autocast_" #name \
"_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(" #device_type \
") instead.") \
TORCH_API inline at::ScalarType get_autocast_##name##_dtype() { \
inline at::ScalarType get_autocast_##name##_dtype() { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \
@ -104,7 +104,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::set_autocast_" #name \
"_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(" #device_type \
", dtype) instead.") \
TORCH_API inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \
inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \

@ -265,7 +265,7 @@ class TORCH_API TensorBase {
return impl_->is_contiguous(memory_format);
}
// Like is_contiguous, but more dynamic shape-friendly. Maybe returns a symbolic representation of
// Like is_contiguous, but more dynamic shape-friendly. May return a symbolic representation of
// contiguity instead of SymTrue/SymFalse when results are data-dependent.
c10::SymBool sym_is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const {
if (impl_->has_symbolic_sizes_strides()) {

@ -105,7 +105,7 @@ using supported_primitive_arg_types = guts::typelist::typelist<
// So a valid input type is one that our boxed functor wrapper can
// unbox from an IValue into a C++ value.
//
// Whereas a valid output type is one that our wrapper can recieve
// Whereas a valid output type is one that our wrapper can receive
// as a C++ value from the unboxed functor, and box into an IValue.
//

@ -677,7 +677,7 @@ inline TypePtr Type::withContained(std::vector<TypePtr> contained_types) {
}
TORCH_API inline bool operator==(const Type& lhs, const Type& rhs) {
inline bool operator==(const Type& lhs, const Type& rhs) {
if (C10_UNLIKELY(!rhs.symmetric())) {
return rhs.equals(lhs);
}

@ -202,18 +202,14 @@ class Vectorized<float> {
store(tmp);
return tmp[idx];
}
// For the boolean version, where we want checks like any-one / all-zero,
// this can be done faster in a different way.
int zero_mask() const {
__at_align__ float tmp[size()];
store(tmp);
int mask = 0;
for (int i = 0; i < size(); ++i) {
if (tmp[i] == 0.f) {
mask |= (1 << i);
}
}
return mask;
uint32x4_t is_zero_vec = vceqzq_f32(values);
const int32x4_t shift = vcombine_s32(
vcreate_s32(0x0 | (int64_t(0x1) << 32)),
vcreate_s32(0x2 | (int64_t(0x3) << 32)));
uint32x4_t bits_vec =
vshlq_u32(vandq_u32(is_zero_vec, vdupq_n_u32(1)), shift);
return vaddvq_u32(bits_vec);
}
Vectorized<float> isnan() const {
return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, values)));
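For reference, a scalar model of what the NEON `zero_mask` above computes (an illustrative sketch, not code from this diff): bit i of the result is set exactly when lane i is zero. The vector version reaches the same answer by AND-ing the comparison mask with 1, shifting each lane left by its own index, and horizontally adding the lanes with `vaddvq_u32`.
// Hypothetical scalar equivalent of the NEON zero_mask() above:
// returns a bitmask with bit i set iff lanes[i] == 0.0f.
static inline int zero_mask_ref(const float* lanes, int n) {
  int mask = 0;
  for (int i = 0; i < n; ++i) {
    int bit = (lanes[i] == 0.0f) ? 1 : 0;  // vceqzq_f32 + vandq with 1
    mask += bit << i;                      // vshlq_u32 by lane index + vaddvq_u32
  }
  return mask;
}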

@ -220,8 +220,32 @@ class Vectorized<c10::Half> : public Vectorized16<
std::memcpy(ptr, tmp_values, count * sizeof(float16_t));
}
}
// For the boolean version, where we want checks like any-one / all-zero,
// this can be done faster in a different way.
int zero_mask() const {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
uint16x8_t is_zero_vec = vceqzq_f16(values);
const int16x8_t shift = vcombine_s16(
vcreate_s16(
0x0 | (int64_t(0x1) << 16) | (int64_t(0x2) << 32) |
(int64_t(0x3) << 48)),
vcreate_s16(
0x4 | (int64_t(0x5) << 16) | (int64_t(0x6) << 32) |
(int64_t(0x7) << 48)));
uint16x8_t bits_vec =
vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
return vaddvq_u16(bits_vec);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Use the known working scalar implementation.
__at_align__ value_type tmp[size()];
store(tmp);
int mask = 0;
for (int i = 0; i < size(); ++i) {
if (tmp[i] == 0) {
mask |= (1 << i);
}
}
return mask;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
}
Vectorized<c10::Half> isnan() const {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
return vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, values)));

@ -348,26 +348,6 @@ class Vectorized<int16_t> {
DEFINE_MEMBER_OP(operator^, int16_t, vec_xor)
};
template <>
Vectorized<int16_t> inline operator<<(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1());
return Vectorized<int16_t>{
vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
}
template <>
Vectorized<int16_t> inline operator>>(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1());
return Vectorized<int16_t>{
vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
}
template <>
Vectorized<int16_t> inline maximum(
const Vectorized<int16_t>& a,
@ -382,6 +362,8 @@ Vectorized<int16_t> inline minimum(
return a.minimum(b);
}
DEFINE_SHIFT_FUNCS(int16_t)
template <>
Vectorized<int16_t> C10_ALWAYS_INLINE
operator+(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {

@ -278,26 +278,6 @@ class Vectorized<int32_t> {
DEFINE_MEMBER_OP(operator^, int32_t, vec_xor)
};
template <>
Vectorized<int32_t> inline operator<<(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1());
return Vectorized<int32_t>{
vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
}
template <>
Vectorized<int32_t> inline operator>>(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1());
return Vectorized<int32_t>{
vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
}
template <>
Vectorized<int32_t> inline maximum(
const Vectorized<int32_t>& a,
@ -312,6 +292,8 @@ Vectorized<int32_t> inline minimum(
return a.minimum(b);
}
DEFINE_SHIFT_FUNCS(int32_t)
template <>
Vectorized<int32_t> C10_ALWAYS_INLINE
operator+(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {

@ -231,26 +231,6 @@ class Vectorized<int64_t> {
DEFINE_MEMBER_OP(operator^, int64_t, vec_xor)
};
template <>
Vectorized<int64_t> inline operator<<(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1());
return Vectorized<int64_t>{
vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
}
template <>
Vectorized<int64_t> inline operator>>(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1());
return Vectorized<int64_t>{
vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
}
template <>
Vectorized<int64_t> inline maximum(
const Vectorized<int64_t>& a,
@ -265,6 +245,8 @@ Vectorized<int64_t> inline minimum(
return a.minimum(b);
}
DEFINE_SHIFT_FUNCS(int64_t)
template <>
Vectorized<int64_t> C10_ALWAYS_INLINE
operator+(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {

@ -1,5 +1,6 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/macros/Macros.h>
#include <cstdint>
@ -39,6 +40,19 @@ using vfloat32 = __attribute__((altivec(vector__))) float;
using vfloat64 = __attribute__((altivec(vector__))) double;
#endif
inline auto make_vuint(vint8 v) {
return reinterpret_cast<vuint8>(v);
}
inline auto make_vuint(vint16 v) {
return reinterpret_cast<vuint16>(v);
}
inline auto make_vuint(vint32 v) {
return reinterpret_cast<vuint32>(v);
}
inline auto make_vuint(vint64 v) {
return reinterpret_cast<vuint64>(v);
}
#if !defined(vec_float)
C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) {
vfloat32 vec_out;
@ -521,6 +535,42 @@ const vfloat64 vd_imag_half = vfloat64{0.0, 0.5};
const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757};
const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0};
template <typename T>
Vectorized<T> VsxShiftRightArith(
const Vectorized<T>& a,
const Vectorized<T>& b) {
const Vectorized<T> max_shift(sizeof(T) * CHAR_BIT - std::is_signed_v<T>);
const auto mask = (b < Vectorized<T>(0)) | (b >= max_shift);
const auto shift = Vectorized<T>::blendv(b, max_shift, mask);
return Vectorized<T>{
vec_sra(a.vec0(), make_vuint(shift.vec0())),
vec_sra(a.vec1(), make_vuint(shift.vec1()))};
}
template <typename T>
Vectorized<T> VsxShiftLeftArith(
const Vectorized<T>& a,
const Vectorized<T>& b) {
const Vectorized<T> max_shift(sizeof(T) * CHAR_BIT);
const auto mask = (b < Vectorized<T>(0)) | (b >= max_shift);
Vectorized<T> ret(
vec_sl(a.vec0(), make_vuint(b.vec0())),
vec_sl(a.vec1(), make_vuint(b.vec1())));
return Vectorized<T>::blendv(ret, Vectorized<T>(0), mask);
}
#define DEFINE_SHIFT_FUNCS(operand_type) \
template <> \
Vectorized<operand_type> C10_ALWAYS_INLINE operator>>( \
const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
return VsxShiftRightArith(a, b); \
} \
template <> \
Vectorized<operand_type> C10_ALWAYS_INLINE operator<<( \
const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
return VsxShiftLeftArith(a, b); \
}
} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace at
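For reference, a scalar model of the shift semantics the helpers above implement for the signed element types they are instantiated with (an assumption drawn from their definitions, not code from this diff): negative or out-of-range shift amounts saturate to the maximum defined shift for the arithmetic right shift, and flush the result to zero for the left shift, instead of invoking undefined behavior.
#include <climits>
#include <type_traits>
// Hypothetical scalar equivalents of VsxShiftRightArith / VsxShiftLeftArith.
template <typename T>
T shift_right_arith_ref(T a, T b) {
  const T max_shift = sizeof(T) * CHAR_BIT - std::is_signed_v<T>;
  const T shift = (b < T(0) || b >= max_shift) ? max_shift : b;
  return a >> shift;  // shift is always in range here; sign-filling for signed T
}
template <typename T>
T shift_left_ref(T a, T b) {
  const T max_shift = sizeof(T) * CHAR_BIT;
  if (b < T(0) || b >= max_shift) {
    return T(0);  // the blendv-with-zero path in the vector version
  }
  using U = std::make_unsigned_t<T>;
  return static_cast<T>(static_cast<U>(a) << b);
}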

@ -0,0 +1,153 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/Exception.h>
namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
// Transpose a [4, 64] block to [64, 4] (with contiguous output, ld=4)
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void transpose_pad_4x64_block(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int krem = 4,
int nrem = 64) {
#if defined(CPU_CAPABILITY_AVX512)
__m512i r[4];
// Load with mask if partial
if (nrem < 64) {
__mmask64 mask = (1ULL << nrem) - 1;
for (int i = 0; i < krem; ++i) {
r[i] = _mm512_maskz_loadu_epi8(mask, src + i * ld_src);
}
for (int i = krem; i < 4; ++i) {
r[i] = _mm512_setzero_si512();
}
} else {
for (int i = 0; i < krem; ++i) {
r[i] = _mm512_loadu_si512(
reinterpret_cast<const __m512i*>(src + i * ld_src));
}
for (int i = krem; i < 4; ++i) {
r[i] = _mm512_setzero_si512();
}
}
// Transpose 4x64 bytes using unpack and shuffle
__m512i t0 = _mm512_unpacklo_epi8(r[0], r[1]);
__m512i t1 = _mm512_unpackhi_epi8(r[0], r[1]);
__m512i t2 = _mm512_unpacklo_epi8(r[2], r[3]);
__m512i t3 = _mm512_unpackhi_epi8(r[2], r[3]);
__m512i u0 = _mm512_unpacklo_epi16(t0, t2);
__m512i u1 = _mm512_unpackhi_epi16(t0, t2);
__m512i u2 = _mm512_unpacklo_epi16(t1, t3);
__m512i u3 = _mm512_unpackhi_epi16(t1, t3);
__m512i v0 = _mm512_shuffle_i32x4(u0, u1, 0x88);
__m512i v1 = _mm512_shuffle_i32x4(u0, u1, 0xdd);
__m512i v2 = _mm512_shuffle_i32x4(u2, u3, 0x88);
__m512i v3 = _mm512_shuffle_i32x4(u2, u3, 0xdd);
__m512i r0 = _mm512_shuffle_i32x4(v0, v2, 0x88);
__m512i r1 = _mm512_shuffle_i32x4(v1, v3, 0x88);
__m512i r2 = _mm512_shuffle_i32x4(v0, v2, 0xdd);
__m512i r3 = _mm512_shuffle_i32x4(v1, v3, 0xdd);
// Store output
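  // Note: nrem counts source columns and each transposed column occupies 4
  // bytes of dst, so each 64-byte register r0..r3 covers 16 columns; the
  // thresholds below (16/32/48/64) are in columns, while masks are in bytes.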
if (nrem < 16) {
__mmask64 mask = (1ULL << (nrem * 4)) - 1;
_mm512_mask_storeu_epi8(dst, mask, r0);
} else if (nrem == 16) {
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
} else if (nrem < 32) {
int n_bytes1 = 64;
int n_bytes2 = (nrem * 4) - n_bytes1;
__mmask64 mask = (1ULL << n_bytes2) - 1;
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
_mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64), mask, r1);
} else if (nrem == 32) {
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
} else if (nrem < 48) {
int n_bytes1 = 64 * 2;
int n_bytes2 = (nrem * 4) - n_bytes1;
__mmask64 mask = (1ULL << n_bytes2) - 1;
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
_mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64 * 2), mask, r2);
} else if (nrem == 48) {
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2);
} else if (nrem < 64) {
int n_bytes1 = 64 * 3;
int n_bytes2 = (nrem * 4) - n_bytes1;
__mmask64 mask = (1ULL << n_bytes2) - 1;
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2);
_mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64 * 3), mask, r3);
} else {
// normal case, nrem == 64
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 3), r3);
}
#else
TORCH_CHECK(
false,
"transpose_pad_4x64_block is only supported when AVX-512 is supported")
#endif
}
// Reorder [K, N] → [K/4, N, 4] (VNNI4-style layout for 8-bit types)
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void pack_vnni4(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t K,
int64_t N) {
#if defined(CPU_CAPABILITY_AVX512)
int64_t bk = 0;
int64_t _K = K / 4 * 4;
int64_t _N = N / 64 * 64;
for (; bk < _K; bk += 4) {
int64_t bn = 0;
for (; bn < _N; bn += 64) {
transpose_pad_4x64_block(
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src);
}
int64_t nrem = N - bn;
if (nrem > 0) {
transpose_pad_4x64_block(
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, 4, nrem);
}
}
// Handle leftover K rows (< 4)
if (K % 4 != 0) {
int krem = K - bk;
int64_t bn = 0;
for (; bn < _N; bn += 64) {
transpose_pad_4x64_block(
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, krem);
}
int64_t nrem = N - bn;
if (nrem > 0) {
transpose_pad_4x64_block(
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, krem, nrem);
}
}
#else
TORCH_CHECK(false, "pack_vnni4 is only supported when AVX-512 is supported")
#endif
}
} // namespace CPU_CAPABILITY
} // namespace at::vec
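As a cross-check, a minimal scalar reference for the [K, N] → [K/4, N, 4] reorder that `pack_vnni4` performs (an illustrative sketch matching the block layout above, not code from this diff); the K remainder is zero-padded, mirroring the zeroed rows in `transpose_pad_4x64_block`.
#include <cstdint>
#include <cstring>
// Hypothetical scalar reference: element (k, n) of src lands at
// dst[(k / 4) * N * 4 + n * 4 + (k % 4)].
template <typename scalar_t>
void pack_vnni4_ref(
    const scalar_t* src,
    scalar_t* dst,
    int64_t ld_src,
    int64_t K,
    int64_t N) {
  const int64_t K4 = (K + 3) / 4;
  std::memset(dst, 0, sizeof(scalar_t) * K4 * N * 4);
  for (int64_t k = 0; k < K; ++k) {
    for (int64_t n = 0; n < N; ++n) {
      dst[(k / 4) * N * 4 + n * 4 + (k % 4)] = src[k * ld_src + n];
    }
  }
}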

@ -154,7 +154,7 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins
| PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS | Default is 0, meaning it is not used. |
| PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED | Default is 1. Set to 0 to disable. |
| PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE | Default (or < 0) is to query L2 cache size. Set to 0 to disable. Otherwise, set to the number of MiB to use for the pool of operator parameters. For example, setting this to the size of your device's memory cache will guarantee that every tuning iteration will use a cold cache. |
| PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS paramters to tuning CSV file. |
| PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS parameters to tuning CSV file. |
### Python Interface
All Python APIs exist in the `torch.cuda.tunable` module.

@ -697,7 +697,7 @@ TORCH_META_FUNC(linalg_cholesky_ex)(const Tensor& A,
auto ndim = A_shape.size();
// L
auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/A.device().type() != at::kMPS);
auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/true);
set_output_strided(0, A_shape, L_strides, A.options(), {});
// info

@ -127,6 +127,9 @@ TORCH_IMPL_FUNC(smooth_l1_loss_out)
TORCH_IMPL_FUNC(mse_loss_out)
(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& result) {
TORCH_CHECK(input.device() == target.device(),
"Expected all tensors to be on the same device, but found at least two devices, ",
input.device(), " and ", target.device(), "!");
if (reduction != Reduction::None) {
Tensor loss;
auto iter = TensorIterator::borrowing_binary_op(loss, input, target);
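A hedged illustration of the new check's effect (assumes a CUDA build; the tensors here are made up for the example): mixing devices now fails fast with the message above instead of surfacing a less direct error later in the kernel path.
#include <ATen/ATen.h>
int main() {
  at::Tensor input = at::rand({4});              // CPU
  at::Tensor target = at::rand({4}, at::kCUDA);  // CUDA
  // Throws: "Expected all tensors to be on the same device, but found at
  // least two devices, cpu and cuda:0!"
  at::mse_loss(input, target);
}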

@ -2862,7 +2862,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, int64_t n) {
T q = x;
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -2910,7 +2910,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, int64_t n) {
T q = x + x;
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -2966,7 +2966,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, int64_t n) {
T q = x + x - T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -3026,7 +3026,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, int64_t n) {
T q = x + x + T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -3150,7 +3150,7 @@ inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, int64_t n) {
T q = T(1.0) - x;
T r;
for (int64_t k = 1; k < n; k++) {
for (int64_t k = 1; (k < n) && !std::isnan(q); k++) {
r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
p = q;
q = r;
@ -3190,7 +3190,7 @@ inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, int64_t n) {
T q = x;
T r;
for (int64_t k = 1; k < n; k++) {
for (int64_t k = 1; (k < n) && !std::isnan(q); k++) {
r = ((k + k + 1) * x * q - k * p) / (k + 1);
p = q;
q = r;
@ -3733,7 +3733,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
T q = x + x - T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;
@ -3785,7 +3785,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
T q = x + x - T(1.0) + (x + x - T(1.0));
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;
@ -3841,7 +3841,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;
@ -3897,7 +3897,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;
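For context, every loop patched above advances a three-term recurrence of the same shape; for the unshifted Chebyshev case (a standard identity, not stated in the diff):
$$T_{k+1}(x) = 2x\,T_k(x) - T_{k-1}(x), \qquad T_0(x) = 1,\quad T_1(x) = x,$$
which is what `r = (x + x) * q - p` computes with `q` holding T_k and `p` holding T_{k-1}. Once `q` becomes NaN, every later term is NaN as well, so the added `!std::isnan(q)` condition only exits the loop early; it never changes the result.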

@ -165,6 +165,12 @@ inline void transpose<uint16_t>(int64_t M, int64_t N, const uint16_t* src, int64
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
fbgemm::transpose_simd<uint16_t>(M, N, src, ld_src, dst, ld_dst);
}
template <>
inline void transpose<uint8_t>(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) {
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
fbgemm::transpose_simd<uint8_t>(M, N, src, ld_src, dst, ld_dst);
}
#endif
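For reference, a scalar sketch of the transpose these specializations dispatch to (an assumption about `fbgemm::transpose_simd` semantics, not code from this diff: an M x N source with leading dimension ld_src becomes an N x M destination with leading dimension ld_dst).
// Hypothetical scalar reference for the transpose<uint8_t> specialization above.
template <typename T>
void transpose_ref(
    int64_t M, int64_t N,
    const T* src, int64_t ld_src,
    T* dst, int64_t ld_dst) {
  for (int64_t i = 0; i < M; ++i) {
    for (int64_t j = 0; j < N; ++j) {
      dst[j * ld_dst + i] = src[i * ld_src + j];
    }
  }
}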
template <typename index_t, typename F>

@ -369,7 +369,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices,
int warp_size = at::cuda::warp_size();
TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 &&
num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(),
num_threads() <= static_cast<uint32_t>(cuda_utils::kCUDABlockReduceMaxThreads()),
"BlockReduceSum requires all warps be active");
const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr<int64_t>();
dim3 grid = unique_indices.numel();

@ -1946,7 +1946,7 @@ const auto chebyshev_polynomial_t_string = jiterator_stringify(
T q = x;
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -1996,7 +1996,7 @@ const auto chebyshev_polynomial_u_string = jiterator_stringify(
T q = x + x;
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -2054,7 +2054,7 @@ const auto chebyshev_polynomial_v_string = jiterator_stringify(
T q = x + x - T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -2116,7 +2116,7 @@ const auto chebyshev_polynomial_w_string = jiterator_stringify(
T q = x + x + T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -2252,7 +2252,7 @@ const auto laguerre_polynomial_l_string = jiterator_stringify(
T q = T(1.0) - x;
T r;
for (int64_t k = 1; k < n; k++) {
for (int64_t k = 1; (k < n) && !isnan(q); k++) {
r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
p = q;
q = r;
@ -2294,7 +2294,7 @@ const auto legendre_polynomial_p_string = jiterator_stringify(
T q = x;
T r;
for (int64_t k = 1; k < n; k++) {
for (int64_t k = 1; (k < n) && !isnan(q); k++) {
r = ((k + k + 1) * x * q - k * p) / (k + 1);
p = q;
q = r;
@ -2851,7 +2851,7 @@ const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify(
T q = x + x - T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;
@ -2905,7 +2905,7 @@ const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify(
T q = x + x - T(1.0) + (x + x - T(1.0));
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;
@ -2963,7 +2963,7 @@ const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify(
T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;
@ -3021,7 +3021,7 @@ const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify(
T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
T r;
for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
p = q;
q = r;

@ -155,6 +155,12 @@ static void check_shape_forward(const Tensor& input,
// but weight/bias and grad_weight/grad_bias are always CPU tensors.
//
static bool mkldnn_conv_enabled_fpmath_mode_bf16(){
return at::globalContext().float32Precision("mkldnn", "conv") == "bf16" &&
mkldnn_bf16_device_check();
}
static inline at::MemoryFormat mkldnn_convolution_memory_format(int64_t dims, bool is_channels_last) {
auto memory_format = at::MemoryFormat::Contiguous;
if (is_channels_last) {
@ -163,7 +169,7 @@ static inline at::MemoryFormat mkldnn_convolution_memory_format(int64_t dims, bo
return memory_format;
}
static void _mkldnn_convolution_out (
static void _mkldnn_convolution_out(
const Tensor& input_t,
const Tensor& weight_t,
const Tensor& bias,
@ -261,6 +267,10 @@ static Tensor _mkldnn_convolution(
output.resize_(output_sizes, memory_format);
y = itensor_from_tensor(output);
}
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
input_t.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
_mkldnn_convolution_out(
input_t,
weight_t,
@ -442,6 +452,10 @@ Tensor mkldnn_convolution_pointwise_binary(
op_attr.set_post_ops(po);
auto aprop_kind = ideep::prop_kind::forward_inference;
if (mkldnn_conv_enabled_fpmath_mode_bf16() && input_t.scalar_type() ==at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (bias.defined()) {
const ideep::tensor b = itensor_from_tensor(bias);
ideep::convolution_forward::compute_binary(
@ -579,6 +593,10 @@ Tensor& mkldnn_convolution_pointwise_binary_(
op_attr = ideep::attr_t::fuse_sum();
}
auto aprop_kind = ideep::prop_kind::forward_inference;
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
input_t.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
_mkldnn_convolution_out(
input_t,
weight_t,
@ -697,6 +715,10 @@ Tensor _mkldnn_convolution_transpose(
y = itensor_from_tensor(output);
}
if (mkldnn_conv_enabled_fpmath_mode_bf16() && input_t.scalar_type() ==at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (bias.defined()) {
const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true);
ideep::convolution_transpose_forward::compute_v3(
@ -781,6 +803,11 @@ Tensor mkldnn_convolution_backward_input(
grad_input.resize_(input_size, memory_format);
grad_x = itensor_from_tensor(grad_input);
}
ideep::attr_t op_attr = ideep::attr_t();
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
weight.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
ideep::convolution_backward_data::compute_v2(
grad_y,
w,
@ -791,7 +818,17 @@ Tensor mkldnn_convolution_backward_input(
padding.vec(),
padding.vec(),
groups,
#if IDEEP_PREREQ(3, 4, 1, 3)
is_channels_last,
op_attr);
#else
is_channels_last);
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
weight.scalar_type() == at::kFloat) {
TORCH_WARN_ONCE(
"Unexpected ideep version to support fpmath_mode_bf16, please update ideep version to align with pytorch main branch");
}
#endif
if (grad_output.is_mkldnn()) {
return MKLDNNTensor(grad_x, grad_output.options());
@ -816,6 +853,11 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
ideep::tensor grad_w, grad_b;
ideep::attr_t op_attr = ideep::attr_t();
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
input.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (bias_defined) {
ideep::convolution_backward_weights::compute_v2(
x,
@ -828,7 +870,8 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
padding.vec(),
padding.vec(),
groups,
is_channels_last);
is_channels_last,
op_attr);
} else {
ideep::convolution_backward_weights::compute_v2(
x,
@ -840,7 +883,8 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
padding.vec(),
padding.vec(),
groups,
is_channels_last);
is_channels_last,
op_attr);
}
if (!is_channels_last) {
@ -962,6 +1006,11 @@ Tensor mkldnn_convolution_transpose_backward_input(
grad_input.resize_(input_size, memory_format);
grad_x = itensor_from_tensor(grad_input);
}
ideep::attr_t op_attr = ideep::attr_t();
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
weight.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
ideep::convolution_transpose_backward_data::compute_v3(
grad_y,
w,
@ -972,7 +1021,8 @@ Tensor mkldnn_convolution_transpose_backward_input(
padding_r(padding, output_padding),
dilation.vec(),
groups,
is_channels_last);
is_channels_last,
op_attr);
if (grad_output.is_mkldnn()) {
return MKLDNNTensor(grad_x, grad_output.options());
@ -998,6 +1048,11 @@ std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
auto x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
ideep::tensor grad_w, grad_b;
ideep::attr_t op_attr = ideep::attr_t();
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
input.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (bias_defined) {
ideep::convolution_transpose_backward_weights::compute_v3(
x,
@ -1010,7 +1065,8 @@ std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
padding_r(padding, output_padding),
dilation.vec(),
groups,
is_channels_last);
is_channels_last,
op_attr);
} else {
ideep::convolution_transpose_backward_weights::compute_v3(
x,
@ -1022,7 +1078,8 @@ std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
padding_r(padding, output_padding),
dilation.vec(),
groups,
is_channels_last);
is_channels_last,
op_attr);
}
if (!is_channels_last) {

@ -68,6 +68,11 @@ mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
namespace at::native {
static bool use_mkldnn_bf32_linear() {
return at::globalContext().float32Precision("mkldnn", "matmul") == "bf16" &&
mkldnn_bf16_device_check();
}
Tensor mkldnn_linear(
const Tensor& self,
const Tensor& weight_t, const std::optional<Tensor>& bias_opt) {
@ -251,7 +256,9 @@ Tensor mkldnn_linear_pointwise(
it != fusion_unary_attr_map().end(), "Fusion behavior undefined.");
op_attr = it->second(scalars, algorithm);
}
if (use_mkldnn_bf32_linear() && input_t.scalar_type() == at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_bias.has_value()) {
ideep::inner_product_forward::compute</*reorder_src=*/false, /*reorder_weight=*/false>(
mkldnn_input,
@ -341,6 +348,10 @@ Tensor mkldnn_linear_pointwise_binary(
auto op_attr = ideep::attr_t::fuse_binary(it_binary->second, other_desc);
auto aprop_kind = ideep::prop_kind::forward_inference;
if (use_mkldnn_bf32_linear() && input_t.scalar_type() == at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_bias.has_value()) {
ideep::inner_product_forward::compute_binary</*reorder_src=*/false, /*reorder_weight=*/false>(
mkldnn_input,

@ -134,6 +134,58 @@ struct chebyshev_polynomial_w_functor {
}
};
struct shifted_chebyshev_polynomial_t_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return static_cast<T>(
c10::metal::shifted_chebyshev_polynomial_t_forward(a, b));
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::shifted_chebyshev_polynomial_t_forward(
float(a), float(b));
}
};
struct shifted_chebyshev_polynomial_u_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return static_cast<T>(
c10::metal::shifted_chebyshev_polynomial_u_forward(a, b));
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::shifted_chebyshev_polynomial_u_forward(
float(a), float(b));
}
};
struct shifted_chebyshev_polynomial_v_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return static_cast<T>(
c10::metal::shifted_chebyshev_polynomial_v_forward(a, b));
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::shifted_chebyshev_polynomial_v_forward(
float(a), float(b));
}
};
struct shifted_chebyshev_polynomial_w_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return static_cast<T>(
c10::metal::shifted_chebyshev_polynomial_w_forward(a, b));
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::shifted_chebyshev_polynomial_w_forward(
float(a), float(b));
}
};
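The shifted variants relate to the ordinary Chebyshev polynomials by an affine change of variable (a standard identity, not stated in this diff):
$$T_n^{*}(x) = T_n(2x - 1), \qquad U_n^{*}(x) = U_n(2x - 1),$$
and likewise for the V and W families, mapping the usual domain $[-1, 1]$ onto $[0, 1]$.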
struct hermite_polynomial_h_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
@ -342,6 +394,14 @@ REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_v);
REGISTER_INT2FLOAT_BINARY_OP(chebyshev_polynomial_w);
REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_w);
REGISTER_INT2FLOAT_BINARY_OP(chebyshev_polynomial_v);
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_t);
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_t);
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_u);
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_u);
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_v);
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_v);
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_w);
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_w);
REGISTER_FLOAT_BINARY_OP(hermite_polynomial_h);
REGISTER_INT2FLOAT_BINARY_OP(hermite_polynomial_h);
REGISTER_FLOAT_BINARY_OP(hermite_polynomial_he);

@ -145,6 +145,28 @@ inline float blockReduceSum(
return sharedScratch[0];
}
template <bool col_major>
inline device float& get_ref(device float* A, uint row, uint col, uint N);
template <>
inline device float& get_ref<true>(
device float* A,
uint row,
uint col,
uint N) {
return A[row * N + col];
}
template <>
inline device float& get_ref<false>(
device float* A,
uint row,
uint col,
uint N) {
return A[row + col * N];
}
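One way to read the two specializations above (an observation drawn from their definitions, not stated in the diff): the second accessor addresses A as the transpose of the first, so flipping the template flag switches the kernels between a matrix and its transpose. A host-side sketch with hypothetical names:
#include <cassert>
// Host-side copies of the two accessors above (illustrative only).
inline float& ref_rowmajor(float* A, unsigned row, unsigned col, unsigned N) {
  return A[row * N + col];
}
inline float& ref_colmajor(float* A, unsigned row, unsigned col, unsigned N) {
  return A[row + col * N];
}
int main() {
  float A[16] = {};
  // The column-major view of (row, col) aliases the row-major view of (col, row).
  assert(&ref_colmajor(A, 1, 2, 4) == &ref_rowmajor(A, 2, 1, 4));
}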
template <bool upper>
kernel void factorDiagonalBlock(
device float* A [[buffer(0)]],
device int* info [[buffer(1)]],
@ -171,7 +193,7 @@ kernel void factorDiagonalBlock(
for (uint i = linear_tid; i < tileSize; i += group_size) {
uint r = i / actSize;
uint c = i % actSize;
tile[r][c] = A[batch_offset + (row0 + r) * N + (col0 + c)];
tile[r][c] = get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
@ -244,10 +266,33 @@ kernel void factorDiagonalBlock(
for (uint i = linear_tid; i < tileSize; i += group_size) {
uint r = i / actSize;
uint c = i % actSize;
A[batch_offset + (row0 + r) * N + (col0 + c)] = tile[r][c];
get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N) = tile[r][c];
}
}
template [[host_name("factorDiagonalBlockU")]]
kernel void factorDiagonalBlock<true>(
device float* A [[buffer(0)]],
device int* info [[buffer(1)]],
constant uint& N [[buffer(2)]],
constant uint& NB [[buffer(3)]],
constant uint& k [[buffer(4)]],
uint3 tid [[thread_position_in_threadgroup]],
uint3 bid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]]);
template [[host_name("factorDiagonalBlockL")]]
kernel void factorDiagonalBlock<false>(
device float* A [[buffer(0)]],
device int* info [[buffer(1)]],
constant uint& N [[buffer(2)]],
constant uint& NB [[buffer(3)]],
constant uint& k [[buffer(4)]],
uint3 tid [[thread_position_in_threadgroup]],
uint3 bid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]]);
template <bool upper>
kernel void applyTRSM(
device float* A [[buffer(0)]],
constant uint& N [[buffer(2)]],
@ -283,12 +328,12 @@ kernel void applyTRSM(
for (uint i = linear_tid; i < actSize_k * actSize_k; i += group_size) {
uint r = i / actSize_k;
uint c = i % actSize_k;
diag[i] = A[batch_offset + (k * NB + r) * N + (k * NB + c)];
diag[i] = get_ref<upper>(A + batch_offset, k * NB + r, k * NB + c, N);
}
for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) {
uint r = i / actSize_k;
uint c = i % actSize_k;
target[i] = A[batch_offset + (row0 + r) * N + (col0 + c)];
target[i] = get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
@ -332,10 +377,31 @@ kernel void applyTRSM(
for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) {
uint r = i / actSize_k;
uint c = i % actSize_k;
A[batch_offset + (row0 + r) * N + (col0 + c)] = target[i];
get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N) = target[i];
}
}
template [[host_name("applyTRSMU")]]
kernel void applyTRSM<true>(
device float* A [[buffer(0)]],
constant uint& N [[buffer(2)]],
constant uint& NB [[buffer(3)]],
constant uint& k [[buffer(4)]],
uint3 tid [[thread_position_in_threadgroup]],
uint3 tgid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]]);
template [[host_name("applyTRSML")]]
kernel void applyTRSM<false>(
device float* A [[buffer(0)]],
constant uint& N [[buffer(2)]],
constant uint& NB [[buffer(3)]],
constant uint& k [[buffer(4)]],
uint3 tid [[thread_position_in_threadgroup]],
uint3 tgid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]]);
template <bool upper>
kernel void applySYRK(
device float* A [[buffer(0)]],
constant uint& N [[buffer(2)]],
@ -403,17 +469,25 @@ kernel void applySYRK(
// Same logic to load/store Cfrag, Afrag, Bfrag...
simdgroup_matrix<float, 8, 8> Cfrag;
simdgroup_load(
Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N);
Cfrag,
&get_ref<upper>(A + batch_offset, row0 + sb_y, col0 + sb_x, N),
N,
0,
!upper);
for (uint kk = 0; kk < actSize_k; kk += 8) {
simdgroup_load(
Afrag, &A[batch_offset + (row0 + sb_y) * N + (k * NB + kk)], N);
Afrag,
&get_ref<upper>(A + batch_offset, row0 + sb_y, k * NB + kk, N),
N,
0,
!upper);
simdgroup_load(
Bfrag,
&A[batch_offset + (col0 + sb_x) * N + (k * NB + kk)],
&get_ref<upper>(A + batch_offset, col0 + sb_x, k * NB + kk, N),
N,
/* matrix_origin = */ 0,
/* transpose = */ true);
/* transpose = */ upper);
simdgroup_multiply(Prod, Afrag, Bfrag);
simdgroup_multiply(Prod, Prod, negative_identity);
@ -421,7 +495,11 @@ kernel void applySYRK(
}
simdgroup_store(
Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N);
Cfrag,
&get_ref<upper>(A + batch_offset, row0 + sb_y, col0 + sb_x, N),
N,
0,
!upper);
}
} else {
// Fallback for non-multiple-of-8 dimensions
@ -442,8 +520,10 @@ kernel void applySYRK(
float sum = 0.0f;
for (uint i = 0; i < actSize_k; i++) {
float a_val = A[batch_offset + (row0 + y) * N + k * NB + i];
float b_val = A[batch_offset + (col0 + x) * N + k * NB + i];
float a_val =
get_ref<upper>(A + batch_offset, row0 + y, k * NB + i, N);
float b_val =
get_ref<upper>(A + batch_offset, col0 + x, k * NB + i, N);
sum = fma(a_val, b_val, sum);
}
sum_accumulator[y * tpg.x + x] += sum;
@ -452,13 +532,35 @@ kernel void applySYRK(
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint y = ty; y < actSize_j; y += tpg.y) {
for (uint x = tx; x < actSize_h; x += tpg.x) {
A[batch_offset + (row0 + y) * N + col0 + x] -=
get_ref<upper>(A + batch_offset, row0 + y, col0 + x, N) -=
sum_accumulator[y * tpg.x + x];
}
}
}
}
template [[host_name("applySYRKU")]]
kernel void applySYRK<true>(
device float* A [[buffer(0)]],
constant uint& N [[buffer(2)]],
constant uint& NB [[buffer(3)]],
constant uint& k [[buffer(4)]],
uint3 tid [[thread_position_in_threadgroup]],
uint3 tgid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]],
uint sgitg [[simdgroup_index_in_threadgroup]]);
template [[host_name("applySYRKL")]]
kernel void applySYRK<false>(
device float* A [[buffer(0)]],
constant uint& N [[buffer(2)]],
constant uint& NB [[buffer(3)]],
constant uint& k [[buffer(4)]],
uint3 tid [[thread_position_in_threadgroup]],
uint3 tgid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]],
uint sgitg [[simdgroup_index_in_threadgroup]]);
kernel void applyPivots(
device float* P [[buffer(0)]],
device const int* pivots [[buffer(1)]],

@ -27,3 +27,14 @@ struct PoolingParams {
_ARRAY_NS::array<int64_t, N - 2> padding;
_ARRAY_NS::array<int64_t, N - 2> dilation;
};
template <unsigned N = 5>
struct PoolingBackwardParams {
int32_t dims;
int32_t pooling_dims;
_ARRAY_NS::array<int64_t, N> grad_input_sizes;
_ARRAY_NS::array<int64_t, N> grad_input_strides;
_ARRAY_NS::array<int64_t, N> grad_output_sizes;
_ARRAY_NS::array<int64_t, N> grad_output_strides;
_ARRAY_NS::array<int64_t, N> indices_strides;
};

@ -1,7 +1,10 @@
#include <ATen/native/mps/kernels/Pooling.h>
#include <c10/metal/atomic.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
// Iterates through all the input elements that this kernel needs to
// apply max to. Specialized for 3 pooling dimensions.
@ -83,6 +86,50 @@ void max_pool_3d_input_iter(
*indices = max_index;
}
struct PoolOffsets {
int64_t output;
int64_t indices;
int64_t input_leading;
PoolOffsets() : output(0), indices(0), input_leading(0) {}
};
// Finds the offset of the output element that a forward pass thread will
// calculate, `output[N, C, d, h, w]`. Also, find the offset of the input for
// the leading dim indices, `input[N, C]`. Optionally, keep track of the output
// pooling dimension indices, `[d, h, w]`.
PoolOffsets find_pool_offsets(
constant int64_t* output_sizes,
constant int64_t* output_strides,
constant int64_t* indices_strides,
constant int64_t* input_strides,
device int64_t* work_pooling_dim_indices,
int32_t dims,
int32_t leading_dims,
uint tid) {
int64_t output_idx = static_cast<int64_t>(tid);
PoolOffsets offsets;
for (int64_t dim = dims - 1; dim >= 0; dim--) {
int64_t dim_idx = output_idx % (output_sizes[dim]);
offsets.output += output_strides[dim] * dim_idx;
offsets.indices += indices_strides[dim] * dim_idx;
if (dim < leading_dims) {
offsets.input_leading += input_strides[dim] * dim_idx;
} else {
// Keep track of pooling dimension indices of the output element, so we
// can use them in the input iteration later on.
if (work_pooling_dim_indices != nullptr) {
work_pooling_dim_indices[dim - leading_dims] = dim_idx;
}
}
output_idx = output_idx / output_sizes[dim];
}
return offsets;
}
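For intuition, the loop above is the standard mixed-radix decomposition of a flat thread id: peel off one dimension index per iteration, innermost dimension first, and accumulate strided offsets along the way. A standalone C++ illustration with made-up sizes and strides:

#include <cstdint>
#include <cstdio>

int main() {
  // Made-up example: a contiguous [2, 3, 4, 5] tensor (N, C, H, W).
  const int64_t sizes[4] = {2, 3, 4, 5};
  const int64_t strides[4] = {60, 20, 5, 1};
  int64_t tid = 37; // flat output element id
  int64_t offset = 0;
  for (int dim = 3; dim >= 0; --dim) {
    int64_t dim_idx = tid % sizes[dim]; // index along this dimension
    offset += strides[dim] * dim_idx;   // accumulate the strided offset
    tid /= sizes[dim];                  // move on to the next-outer dim
  }
  // For a contiguous layout the strided offset equals the flat id.
  std::printf("offset = %lld\n", static_cast<long long>(offset)); // 37
  return 0;
}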
// Kernel computes one element of the output per kernel call.
template <typename T>
kernel void max_pool(
@ -113,32 +160,20 @@ kernel void max_pool(
// element of the output. We need to fill it with the proper values below.
device int64_t* work_pooling_dim_indices =
work_pooling_dim_indices_ + tid * pooling_dims;
-  int64_t output_idx = static_cast<int64_t>(tid);
-  int64_t output_offset = 0;
-  int64_t indices_offset = 0;
-  int64_t input_leading_offset = 0;
-  // First, find the offset of the output element this thread will calculate,
-  // `output[N, C, d, h, w]`. Also, find the offset of the input for the leading
-  // dim indices, `input[N, C]` and keep track of the pooling dimension indices,
-  // `[d, h , w]`.
-  for (int64_t dim = dims - 1; dim >= 0; dim--) {
-    int64_t dim_idx = output_idx % (output_sizes[dim]);
-    output_offset += output_strides[dim] * dim_idx;
-    indices_offset += indices_strides[dim] * dim_idx;
-    if (dim < leading_dims) {
-      input_leading_offset += input_strides[dim] * dim_idx;
-    } else {
-      // Keep track of pooling dimension indices of the output element, so we
-      // can use them in the input iteration later on.
-      work_pooling_dim_indices[dim - leading_dims] = dim_idx;
-    }
-    output_idx = output_idx / output_sizes[dim];
-  }
-  output += output_offset;
-  indices += indices_offset;
-  input += input_leading_offset;
+  PoolOffsets offsets = find_pool_offsets(
+      output_sizes,
+      output_strides,
+      indices_strides,
+      input_strides,
+      work_pooling_dim_indices,
+      dims,
+      leading_dims,
+      tid);
+  output += offsets.output;
+  indices += offsets.indices;
+  input += offsets.input_leading;
max_pool_3d_input_iter<T>(
input,
@ -153,6 +188,69 @@ kernel void max_pool(
dilation);
}
// Finds the element in the grad input which corresponds to the index into the
// pool, and then adds the grad output element to it.
template <typename T>
void max_pool_backward_impl(
device AtomicType_t<T>* grad_input,
T grad_output_element,
int32_t input_index,
constant int64_t* grad_input_sizes,
constant int64_t* grad_input_strides,
int32_t grad_input_leading_offset,
int32_t pooling_dims) {
int32_t size_prod = 1;
int32_t pool_offset = 0;
for (int32_t dim = pooling_dims - 1; dim >= 0; dim--) {
int32_t next_size_prod = grad_input_sizes[dim] * size_prod;
pool_offset +=
grad_input_strides[dim] * ((input_index % next_size_prod) / size_prod);
size_prod *= grad_input_sizes[dim];
}
AtomicType<T>::atomic_add(
grad_input, grad_input_leading_offset + pool_offset, grad_output_element);
}
// Kernel computes one element of the grad input per kernel call.
template <typename T>
kernel void max_pool_backward(
device AtomicType_t<T>* grad_input [[buffer(0)]],
constant T* grad_output [[buffer(1)]],
constant int64_t* indices [[buffer(2)]],
constant PoolingBackwardParams<5>& params [[buffer(3)]],
uint tid [[thread_position_in_grid]]) {
int32_t pooling_dims = params.pooling_dims;
int32_t dims = params.dims;
constant int64_t* grad_input_sizes = params.grad_input_sizes.data();
constant int64_t* grad_input_strides = params.grad_input_strides.data();
constant int64_t* grad_output_sizes = params.grad_output_sizes.data();
constant int64_t* grad_output_strides = params.grad_output_strides.data();
constant int64_t* indices_strides = params.indices_strides.data();
int32_t leading_dims = dims - pooling_dims;
PoolOffsets offsets = find_pool_offsets(
grad_output_sizes,
grad_output_strides,
indices_strides,
grad_input_strides,
nullptr,
dims,
leading_dims,
tid);
max_pool_backward_impl<T>(
grad_input,
grad_output[offsets.output],
indices[offsets.indices],
grad_input_sizes + leading_dims,
grad_input_strides + leading_dims,
offsets.input_leading,
pooling_dims);
}
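The backward kernel is a scatter-add: every grad_output element is accumulated into the grad_input slot named by its saved argmax index, atomically, because overlapping pooling windows may record the same input position. Stripped of the stride decoding that max_pool_backward_impl performs, the pattern reduces to this sequential C++ sketch (flat indices assumed for illustration):

#include <cstdint>
#include <vector>

// Sequential stand-in for the device-side scatter-add; on the GPU each
// += is AtomicType<T>::atomic_add because threads may collide on a slot.
void max_pool_backward_ref(std::vector<float>& grad_input,
                           const std::vector<float>& grad_output,
                           const std::vector<int64_t>& indices) {
  for (size_t i = 0; i < grad_output.size(); ++i) {
    grad_input[indices[i]] += grad_output[i];
  }
}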
#define REGISTER_MAX_POOL_OP(DTYPE) \
template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
constant void* input_ [[buffer(0)]], \
@ -162,6 +260,15 @@ kernel void max_pool(
constant PoolingParams<5>& params [[buffer(4)]], \
uint tid [[thread_position_in_grid]]);
#define REGISTER_MAX_POOL_BACKWARD_OP(DTYPE) \
template [[host_name("max_pool_backward_" #DTYPE)]] \
kernel void max_pool_backward<DTYPE>( \
device AtomicType_t<DTYPE> * grad_input [[buffer(0)]], \
constant DTYPE * grad_output_ [[buffer(1)]], \
constant int64_t* grad_indices_ [[buffer(2)]], \
constant PoolingBackwardParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_MAX_POOL_OP(float);
REGISTER_MAX_POOL_OP(half);
REGISTER_MAX_POOL_OP(int);
@ -170,6 +277,11 @@ REGISTER_MAX_POOL_OP(short);
REGISTER_MAX_POOL_OP(char);
REGISTER_MAX_POOL_OP(uchar);
REGISTER_MAX_POOL_OP(bool);
REGISTER_MAX_POOL_BACKWARD_OP(float);
REGISTER_MAX_POOL_BACKWARD_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_MAX_POOL_OP(bfloat);
REGISTER_MAX_POOL_BACKWARD_OP(bfloat);
#endif


@ -7,6 +7,24 @@ using namespace metal;
using c10::metal::accum_t;
struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
// Reference:
// https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
T min_val = c10::metal::min(x, y);
T max_val = c10::metal::max(x, y);
if (min_val != max_val || metal::isfinite(min_val)) {
// nan will be propagated here
return c10::metal::log1p(metal::exp(min_val - max_val)) + max_val;
} else {
// special case to correctly handle infinite cases
return x;
}
};
};
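The branch above is the usual numerically stable log-add-exp: rewrite log(exp(x) + exp(y)) as max + log1p(exp(min - max)) so that exp never overflows, and fall back to returning the input when both operands are the same infinity. A host-side C++ illustration of the same logic:

#include <algorithm>
#include <cmath>
#include <cstdio>

float log_add_exp(float x, float y) {
  float mn = std::min(x, y);
  float mx = std::max(x, y);
  if (mn != mx || std::isfinite(mn)) {
    return std::log1p(std::exp(mn - mx)) + mx; // NaN propagates here
  }
  return x; // both +inf or both -inf
}

int main() {
  std::printf("%f\n", log_add_exp(1000.0f, 1000.0f));     // ~1000.693147, no overflow
  std::printf("%f\n", log_add_exp(-INFINITY, 2.0f));      // 2.0
  std::printf("%f\n", log_add_exp(-INFINITY, -INFINITY)); // -inf
  return 0;
}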
#if __METAL_VERSION__ < 310
template <typename T, typename acc_t = accum_t<T>>
struct CumMinOp {
@ -32,6 +50,16 @@ struct CumMaxOp {
}
};
template <typename T, typename acc_t = accum_t<T>>
struct LogCumSumExpOp {
static acc_t apply(acc_t x, acc_t y) {
return LogAddExp{}(x, y);
}
static acc_t identity() {
return -metal::numeric_limits<acc_t>::infinity();
}
};
// Inclusive scan along innermost dimension for contiguous tensors
template <typename T, typename Op, typename acc_t = accum_t<T>>
kernel void scan_contiguous_innermost_dim(
@ -345,6 +373,10 @@ kernel void scan_with_indices_strided(
constant uint& scan_dim [[buffer(8)]], \
uint thread_index [[thread_position_in_grid]]);
// Simple scan operations
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, float);
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, half);
// Scan operations with indices
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, float);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, half);
@ -438,6 +470,30 @@ inline bool simd_shuffle(bool data, uint16_t lane) {
return simd_shuffle_and_fill_up(val, init, 1); \
}
template <typename T, typename acc_t = accum_t<T>>
struct LogCumSumExpOp {
static constexpr constant acc_t init = static_cast<acc_t>(
metal::is_floating_point_v<T> ? -metal::numeric_limits<T>::infinity()
: metal::numeric_limits<T>::lowest());
acc_t operator()(acc_t a, acc_t b) {
return LogAddExp{}(a, b);
}
acc_t simd_scan(acc_t x) {
for (int i = 1; i <= 16; i *= 2) {
acc_t other = simd_shuffle_and_fill_up(x, init, i);
x = LogAddExp{}(x, other);
}
return x;
}
acc_t simd_exclusive_scan(acc_t x) {
x = simd_scan(x);
return simd_shuffle_and_fill_up(x, init, 1);
}
};
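The shuffle loop in simd_scan is a Hillis-Steele inclusive scan across the 32 lanes of a simdgroup: at step i each lane combines itself with the lane i slots below it, and lanes shifted in from the bottom are filled with the operator's identity so the combine is a no-op there. Simulated sequentially in C++, with ordinary addition standing in for LogAddExp:

#include <cstdio>

int main() {
  const int simd_size = 32;
  float x[simd_size], shifted[simd_size];
  for (int l = 0; l < simd_size; ++l) x[l] = 1.0f; // scan of all-ones
  const float init = 0.0f; // identity for +; LogCumSumExpOp uses -inf
  for (int i = 1; i <= 16; i *= 2) {
    // simd_shuffle_and_fill_up: shift values up by i lanes, fill with init
    for (int l = 0; l < simd_size; ++l)
      shifted[l] = (l >= i) ? x[l - i] : init;
    for (int l = 0; l < simd_size; ++l)
      x[l] += shifted[l];
  }
  std::printf("lane 31 = %g\n", x[31]); // 32: the inclusive prefix sum
  return 0;
}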
// Pair structure to hold value and index for cummin/cummax operations
template <typename T, typename acc_t = accum_t<T>>
struct ValueIndexPair {
@ -642,6 +698,203 @@ inline T ceildiv(T N, U M) {
return (N + M - 1) / M;
}
// Inclusive scan along innermost dimension for contiguous tensors
template <typename T, typename Op, int N_READS, typename acc_t = accum_t<T>>
kernel void scan_innermost_dim(
const device T* in [[buffer(0)]],
device T* out [[buffer(1)]],
const constant size_t& axis_size [[buffer(2)]],
uint3 gid [[threadgroup_position_in_grid]],
uint3 gsize [[threadgroups_per_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 lsize [[threads_per_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
constexpr int simd_size = 32;
Op op;
// Position the pointers
size_t offset = (gid.y + gsize.y * size_t(gid.z)) * axis_size;
in += offset;
out += offset;
// Compute the number of simd_groups
uint simd_groups = lsize.x / simd_size;
// Allocate memory
acc_t prefix = Op::init;
acc_t values[N_READS];
threadgroup acc_t simdgroup_sums[32];
// Loop over the reduced axis in blocks of size ceildiv(axis_size,
// N_READS*lsize)
// Read block
// Compute inclusive scan of the block
// Compute inclusive scan per thread
// Compute exclusive scan of thread sums in simdgroup
// Write simdgroup sums in SM
// Compute exclusive scan of simdgroup sums
// Compute the output by scanning prefix, prev_simdgroup, prev_thread,
// value
// Write block
for (uint r = 0; r < ceildiv(axis_size, N_READS * lsize.x); r++) {
// Compute the block offset
uint offset = r * lsize.x * N_READS + lid.x * N_READS;
// Read the values
if ((offset + N_READS) < axis_size) {
load_unsafe<T, N_READS>(values, in + offset);
} else {
load_safe<T, N_READS>(values, in + offset, offset, axis_size, Op::init);
}
// Compute an inclusive scan per thread
for (int i = 1; i < N_READS; i++) {
values[i] = op(values[i], values[i - 1]);
}
// Compute exclusive scan of thread sums
acc_t prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);
// Write simdgroup_sums to SM
if (simd_lane_id == simd_size - 1) {
simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Compute exclusive scan of simdgroup_sums
if (simd_group_id == 0) {
acc_t prev_simdgroup =
op.simd_exclusive_scan(simdgroup_sums[simd_lane_id]);
simdgroup_sums[simd_lane_id] = prev_simdgroup;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Compute the output
for (int i = 0; i < N_READS; i++) {
values[i] = op(values[i], prefix);
values[i] = op(values[i], simdgroup_sums[simd_group_id]);
values[i] = op(values[i], prev_thread);
}
// Write the values
if ((offset + N_READS) < axis_size) {
write_unsafe<T, N_READS>(values, out + offset);
} else {
write_safe<T, N_READS>(values, out + offset, offset, axis_size);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Share the prefix
if (simd_group_id == simd_groups - 1 && simd_lane_id == simd_size - 1) {
simdgroup_sums[0] = values[N_READS - 1];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
prefix = simdgroup_sums[0];
}
}
// Inclusive scan along outer dimension for contiguous tensors
template <typename T, typename Op, int N_READS, typename acc_t = accum_t<T>>
kernel void scan_outer_dim(
const device T* in [[buffer(0)]],
device T* out [[buffer(1)]],
const constant size_t& axis_size [[buffer(2)]],
const constant size_t& stride [[buffer(3)]],
const constant size_t& stride_blocks [[buffer(4)]],
uint3 gid [[threadgroup_position_in_grid]],
uint3 gsize [[threadgroups_per_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
constexpr int simd_size = 32;
constexpr int BM = 32;
constexpr int BN = 32;
constexpr int BN_pad = 32 + 16 / sizeof(T);
constexpr int n_simds = BN / N_READS;
constexpr int n_scans = BN / n_simds;
Op op;
threadgroup acc_t read_buffer[BM * BN_pad];
acc_t values[n_scans];
acc_t prefix[n_scans];
for (int i = 0; i < n_scans; i++) {
prefix[i] = Op::init;
}
// Compute offsets
size_t full_gid = gid.y + gsize.y * size_t(gid.z);
size_t offset = full_gid / stride_blocks * axis_size * stride;
size_t global_index_x = full_gid % stride_blocks * BN;
uint read_offset_y = (lid.x * N_READS) / BN;
uint read_offset_x = (lid.x * N_READS) % BN;
uint scan_offset_y = simd_lane_id;
uint scan_offset_x = simd_group_id * n_scans;
uint stride_limit = stride - global_index_x;
in += offset + global_index_x + read_offset_x;
out += offset + global_index_x + read_offset_x;
threadgroup acc_t* read_into =
read_buffer + read_offset_y * BN_pad + read_offset_x;
threadgroup acc_t* read_from =
read_buffer + scan_offset_y * BN_pad + scan_offset_x;
for (uint j = 0; j < axis_size; j += BM) {
// Calculate the indices for the current thread
uint index_y = j + read_offset_y;
uint check_index_y = index_y;
// Read into shared memory with type conversion
if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
for (int i = 0; i < N_READS; i++) {
read_into[i] = static_cast<acc_t>(in[index_y * stride + i]);
}
} else {
for (int i = 0; i < N_READS; i++) {
if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
read_into[i] = static_cast<acc_t>(in[index_y * stride + i]);
} else {
read_into[i] = Op::init;
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Read strided into registers
for (int i = 0; i < n_scans; i++) {
values[i] = read_from[i];
}
simdgroup_barrier(mem_flags::mem_threadgroup);
// Perform the scan
for (int i = 0; i < n_scans; i++) {
values[i] = op.simd_scan(values[i]);
values[i] = op(values[i], prefix[i]);
prefix[i] = simd_shuffle(values[i], simd_size - 1);
}
// Write to shared memory
for (int i = 0; i < n_scans; i++) {
read_from[i] = values[i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Write to device memory with type conversion
if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
for (int i = 0; i < N_READS; i++) {
out[index_y * stride + i] = static_cast<T>(read_into[i]);
}
} else {
for (int i = 0; i < N_READS; i++) {
if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
out[index_y * stride + i] = static_cast<T>(read_into[i]);
}
}
}
}
}
template <typename T, typename Op, int N_READS, typename acc_t = accum_t<T>>
kernel void scan_with_indices_innermost_dim(
const device T* in [[buffer(0)]],
@ -829,6 +1082,32 @@ kernel void scan_with_indices_outer_dim(
}
}
#define REGISTER_SCAN_OP(OP_NAME, OP_CLASS, DTYPE, NREADS) \
template [[host_name(#OP_NAME "_innermost_" #DTYPE)]] [[kernel]] void \
scan_innermost_dim<DTYPE, OP_CLASS<DTYPE>, NREADS>( \
const device DTYPE* in [[buffer(0)]], \
device DTYPE* out [[buffer(1)]], \
const constant size_t& axis_size [[buffer(2)]], \
uint3 gid [[threadgroup_position_in_grid]], \
uint3 gsize [[threadgroups_per_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]); \
\
template [[host_name(#OP_NAME "_outer_" #DTYPE)]] [[kernel]] void \
scan_outer_dim<DTYPE, OP_CLASS<DTYPE>, NREADS>( \
const device DTYPE* in [[buffer(0)]], \
device DTYPE* out [[buffer(1)]], \
const constant size_t& axis_size [[buffer(2)]], \
const constant size_t& stride [[buffer(3)]], \
const constant size_t& stride_blocks [[buffer(4)]], \
uint3 gid [[threadgroup_position_in_grid]], \
uint3 gsize [[threadgroups_per_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]])
#define REGISTER_SCAN_WITH_INDICES_OP(OP_NAME, OP_CLASS, DTYPE, NREADS) \
template [[host_name(#OP_NAME "_innermost_" #DTYPE)]] [[kernel]] void \
scan_with_indices_innermost_dim<DTYPE, OP_CLASS<DTYPE>, NREADS>( \
@ -857,6 +1136,11 @@ kernel void scan_with_indices_outer_dim(
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]])
// Simple scan operations
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, float, 4);
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, half, 4);
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, bfloat, 4);
// Scan with indices operations for cummin/cummax
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, float, 4);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, half, 4);


@ -119,6 +119,30 @@ static void chebyshev_polynomial_w_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "chebyshev_polynomial_w");
}
static void shifted_chebyshev_polynomial_t_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
"shifted_chebyshev_polynomial_t_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_t");
}
static void shifted_chebyshev_polynomial_u_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
"shifted_chebyshev_polynomial_u_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_u");
}
static void shifted_chebyshev_polynomial_v_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
"shifted_chebyshev_polynomial_v_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_v");
}
static void shifted_chebyshev_polynomial_w_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
"shifted_chebyshev_polynomial_w_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_w");
}
static void hermite_polynomial_h_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
"hermite_polynomial_h_mps not implemented for non-floating types");
@ -177,6 +201,10 @@ REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kerne
REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_v_stub, &chebyshev_polynomial_v_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_w_stub, &chebyshev_polynomial_w_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_t_stub, &shifted_chebyshev_polynomial_t_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_u_stub, &shifted_chebyshev_polynomial_u_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_v_stub, &shifted_chebyshev_polynomial_v_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_w_stub, &shifted_chebyshev_polynomial_w_mps_kernel)
REGISTER_DISPATCH(hermite_polynomial_h_stub, &hermite_polynomial_h_mps_kernel)
REGISTER_DISPATCH(hermite_polynomial_he_stub, &hermite_polynomial_he_mps_kernel)
REGISTER_DISPATCH(polar_stub, &polar_mps_kernel);


@ -2,6 +2,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/BatchLinearAlgebra.h>
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
@ -22,7 +23,6 @@
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_cholesky_native.h>
#include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h>
#include <ATen/ops/linalg_lu_factor_native.h>
@ -1097,25 +1097,8 @@ static void lu_unpack_mps_impl(const Tensor& LU_data,
}
}
-static void linalg_cholesky_mps_impl(const Tensor& input,
-                                     bool upper,
-                                     bool check_errors,
-                                     const Tensor& out,
-                                     const Tensor& info) {
-  using namespace mps;
-  TORCH_CHECK(out.is_mps());
-  TORCH_CHECK(input.scalar_type() == at::ScalarType::Float, "linalg.cholesky: Input tensor must be float32");
-  TORCH_CHECK(input.dim() >= 2, "linalg.cholesky: Input tensor must be at least 2D");
-  TORCH_CHECK(input.size(-2) == input.size(-1), "linalg.cholesky: Input tensor must be square");
-  auto input_sizes = input.sizes();
-  resize_output(out, input_sizes);
-  resize_output(info, {input_sizes.begin(), input_sizes.end() - 2});
-  if (input.numel() == 0) {
-    info.zero_();
-    return;
-  }
-  out.copy_(input);
+static void cholesky_stub_impl(const Tensor& out, const Tensor& info, bool upper) {
auto input_sizes = out.sizes();
int64_t ndim = out.dim();
int64_t N = out.size(-1);
@ -1124,9 +1107,9 @@ static void linalg_cholesky_mps_impl(const Tensor& input,
auto stream = getCurrentMPSStream();
auto device = MPSDevice::getInstance()->device();
-  auto factorDiagonalPSO = lib.getPipelineStateForFunc("factorDiagonalBlock");
-  auto applyTRSMPSO = lib.getPipelineStateForFunc("applyTRSM");
-  auto applySYRKPSO = lib.getPipelineStateForFunc("applySYRK");
+  auto factorDiagonalPSO = lib.getPipelineStateForFunc(upper ? "factorDiagonalBlockU" : "factorDiagonalBlockL");
+  auto applyTRSMPSO = lib.getPipelineStateForFunc(upper ? "applyTRSMU" : "applyTRSML");
+  auto applySYRKPSO = lib.getPipelineStateForFunc(upper ? "applySYRKU" : "applySYRKL");
int64_t NB = std::min<int64_t>(32, N);
int64_t numBlocks = (N + NB - 1) / NB;
@ -1168,33 +1151,8 @@ static void linalg_cholesky_mps_impl(const Tensor& input,
}
});
}
-  int status;
-  if (check_errors) {
-    if (info_.dim() > 0) {
-      // batch case
-      for (const auto i : c10::irange(B)) {
-        status = info_[i].item<int>();
-        TORCH_CHECK(
-            status == 0,
-            "linalg.cholesky(): (Batch element ",
-            i,
-            "): The factorization could not be completed because the input is not positive-definite (the leading minor of order ",
-            status,
-            " is not positive-definite).");
-      }
-    } else {
-      // single matrix case(no batch size)
-      status = info.item<int>();
-      TORCH_CHECK(
-          status == 0,
-          "linalg.cholesky(): The factorization could not be completed because the input is not positive-definite (the leading minor of order ",
-          status,
-          " is not positive-definite).");
-    }
-  }
-  out.tril_();
-  upper ? out.transpose_(ndim - 2, ndim - 1) : out;
}
} // namespace mps
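Taken together, the three pipelines implement a blocked right-looking Cholesky: factor the NB x NB diagonal block, triangular-solve the panel beside it, then apply a rank-NB SYRK update to the trailing submatrix. A sequential C++ reference for the lower-triangular case, for orientation only (the Metal kernels parallelize each step):

#include <algorithm>
#include <cmath>
#include <vector>

// Sequential reference for the blocked right-looking Cholesky that the
// factorDiagonalBlock / applyTRSM / applySYRK pipelines implement on GPU.
// Row-major N x N matrix, lower triangle; returns false if A is not
// positive-definite (mirroring the kernels' info reporting).
bool blocked_cholesky_lower(std::vector<float>& A, int N, int NB) {
  for (int k = 0; k < N; k += NB) {
    const int kb = std::min(NB, N - k);
    // 1) factorDiagonalBlock: unblocked Cholesky of A[k:k+kb, k:k+kb]
    for (int j = k; j < k + kb; ++j) {
      float d = A[j * N + j];
      for (int p = k; p < j; ++p) d -= A[j * N + p] * A[j * N + p];
      if (d <= 0.0f) return false;
      A[j * N + j] = std::sqrt(d);
      for (int i = j + 1; i < k + kb; ++i) {
        float s = A[i * N + j];
        for (int p = k; p < j; ++p) s -= A[i * N + p] * A[j * N + p];
        A[i * N + j] = s / A[j * N + j];
      }
    }
    // 2) applyTRSM: rows below the diagonal block solve against it
    for (int i = k + kb; i < N; ++i) {
      for (int j = k; j < k + kb; ++j) {
        float s = A[i * N + j];
        for (int p = k; p < j; ++p) s -= A[i * N + p] * A[j * N + p];
        A[i * N + j] = s / A[j * N + j];
      }
    }
    // 3) applySYRK: trailing submatrix loses the panel's outer product
    for (int i = k + kb; i < N; ++i) {
      for (int j = k + kb; j <= i; ++j) {
        float s = 0.0f;
        for (int p = k; p < k + kb; ++p) s += A[i * N + p] * A[j * N + p];
        A[i * N + j] -= s;
      }
    }
  }
  return true;
}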
Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, const Scalar& beta, const Scalar& alpha) {
@ -1355,23 +1313,6 @@ Tensor& addbmm_out_mps(const Tensor& self,
return result;
}
-Tensor cholesky_mps(const Tensor& self, bool upper) {
-  auto out = at::empty_like(self, MemoryFormat::Contiguous);
-  cholesky_mps_out(self, upper, out);
-  return out;
-}
-Tensor& cholesky_mps_out(const Tensor& self, bool upper, Tensor& out) {
-  auto info = at::empty({}, self.options().dtype(kInt));
-  mps::linalg_cholesky_mps_impl(self, upper, true, out, info);
-  return out;
-}
-TORCH_IMPL_FUNC(linalg_cholesky_ex_out_mps)
-(const Tensor& self, bool upper, bool check_errors, const Tensor& L, const Tensor& info) {
-  mps::linalg_cholesky_mps_impl(self, upper, check_errors, L, info);
-}
Tensor addbmm_mps(const Tensor& self,
const Tensor& batch1,
const Tensor& batch2,
@ -1460,4 +1401,6 @@ TORCH_IMPL_FUNC(linalg_lu_factor_ex_out_mps)
TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const Tensor& result, const Tensor& info) {
mps::linalg_inv_ex_out_mps_impl(A, check_errors, result, info);
}
REGISTER_DISPATCH(cholesky_stub, mps::cholesky_stub_impl)
} // namespace at::native


@ -18,6 +18,7 @@
#include <ATen/ops/max_pool2d_native.h>
#include <ATen/ops/max_pool2d_with_indices_backward_native.h>
#include <ATen/ops/max_pool2d_with_indices_native.h>
#include <ATen/ops/max_pool3d_with_indices_backward_native.h>
#include <ATen/ops/max_pool3d_with_indices_native.h>
#endif
@ -270,16 +271,16 @@ static IntArrayRef tensor_to_intarrayref(const Tensor& tensor) {
return IntArrayRef(data_ptr, length);
}
-static void max_pool_with_indices_out_mps_template(const Tensor& output,
-                                                   const Tensor& indices,
-                                                   const Tensor& input,
-                                                   IntArrayRef kernel_size,
-                                                   IntArrayRef stride,
-                                                   IntArrayRef padding,
-                                                   IntArrayRef dilation,
-                                                   bool ceil_mode,
-                                                   const int32_t pooling_dims,
-                                                   const std::string& op_name) {
+using PoolSizes = std::tuple<int32_t, Tensor, Tensor, Tensor, Tensor, Tensor>;
+static PoolSizes process_pool_sizes(const Tensor& input,
+                                    IntArrayRef kernel_size,
+                                    IntArrayRef stride,
+                                    IntArrayRef padding,
+                                    IntArrayRef dilation,
+                                    bool ceil_mode,
+                                    const int32_t pooling_dims,
+                                    const std::string& op_name) {
TORCH_INTERNAL_ASSERT(pooling_dims == 1 || pooling_dims == 2 || pooling_dims == 3);
const int32_t dims = input.dim();
@ -387,9 +388,27 @@ static void max_pool_with_indices_out_mps_template(const Tensor& output,
t_output_size.slice(0, leading_dims) = t_output_pooling_size;
return std::tuple<int32_t, Tensor, Tensor, Tensor, Tensor, Tensor>(
dims, t_output_size, t_kernel_size, t_stride, t_padding, t_dilation);
}
static void max_pool_with_indices_out_mps_template(const Tensor& output,
const Tensor& indices,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
bool ceil_mode,
const int32_t pooling_dims,
const std::string& op_name) {
auto [dims, t_output_size, t_kernel_size, t_stride, t_padding, t_dilation] =
process_pool_sizes(input, kernel_size, stride, padding, dilation, ceil_mode, pooling_dims, op_name);
IntArrayRef output_size = tensor_to_intarrayref(t_output_size);
-  output.resize_(output_size);
-  indices.resize_(output_size);
+  const auto memory_format = input.suggest_memory_format();
+  output.resize_(output_size, memory_format);
+  indices.resize_(output_size, memory_format);
auto iter = TensorIteratorConfig().add_output(output).resize_outputs(false).check_all_same_dtype(false).build();
@ -436,6 +455,52 @@ static void max_pool_with_indices_out_mps_template(const Tensor& output,
});
}
static void max_pool_with_indices_backward_out_mps_template(Tensor& grad_input,
const Tensor& indices,
const Tensor& input,
const Tensor& grad_output,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
bool ceil_mode,
const int32_t pooling_dims,
const std::string& op_name) {
auto [dims, t_output_size, t_kernel_size, t_stride, t_padding, t_dilation] =
process_pool_sizes(input, kernel_size, stride, padding, dilation, ceil_mode, pooling_dims, op_name);
const auto memory_format = input.suggest_memory_format();
grad_input.resize_(input.sizes(), memory_format);
grad_input.fill_(0);
id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = grad_output.numel();
PoolingBackwardParams<5> params;
params.dims = dims;
params.pooling_dims = pooling_dims;
memcpy(params.grad_input_sizes.data(), grad_input.sizes().data(), dims * sizeof(int64_t));
memcpy(params.grad_input_strides.data(), grad_input.strides().data(), dims * sizeof(int64_t));
memcpy(params.grad_output_strides.data(), grad_output.strides().data(), dims * sizeof(int64_t));
memcpy(params.grad_output_sizes.data(), grad_output.sizes().data(), dims * sizeof(int64_t));
memcpy(params.indices_strides.data(), indices.strides().data(), dims * sizeof(int64_t));
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
auto maxPoolPSO = lib.getPipelineStateForFunc("max_pool_backward_" + scalarToMetalTypeString(input));
getMPSProfiler().beginProfileKernel(maxPoolPSO, op_name, {input});
[computeEncoder setComputePipelineState:maxPoolPSO];
mtl_setArgs(computeEncoder, grad_input, grad_output, indices, params);
mtl_dispatch1DJob(computeEncoder, maxPoolPSO, numThreads);
getMPSProfiler().endProfileKernel(maxPoolPSO);
}
});
}
static void avg_pool2d_template(const Tensor& input,
const Tensor& output,
const std::optional<Tensor>& grad_output_opt,
@ -738,6 +803,52 @@ std::tuple<Tensor, Tensor> max_pool3d_with_indices_mps(const Tensor& input,
return std::tuple<Tensor, Tensor>(output, indices);
}
Tensor& max_pool3d_with_indices_backward_out_mps(const Tensor& grad_output,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
bool ceil_mode,
const Tensor& indices,
Tensor& grad_input) {
mps::max_pool_with_indices_backward_out_mps_template(grad_input,
indices,
input,
grad_output,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
/*pooling_dims=*/3,
"max_pool3d_backward");
return grad_input;
}
Tensor max_pool3d_with_indices_backward_mps(const Tensor& grad_output,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
bool ceil_mode,
const Tensor& indices) {
auto grad_input = at::empty({0}, input.options());
mps::max_pool_with_indices_backward_out_mps_template(grad_input,
indices,
input,
grad_output,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
/*pooling_dims=*/3,
"max_pool3d_backward");
return grad_input;
}
TORCH_IMPL_FUNC(avg_pool2d_out_mps)
(const Tensor& input,
int64_t kH,


@ -10,6 +10,7 @@
#else
#include <ATen/ops/_cummax_helper_native.h>
#include <ATen/ops/_cummin_helper_native.h>
#include <ATen/ops/_logcumsumexp_native.h>
#endif
#include <fmt/format.h>
@ -163,6 +164,111 @@ static std::pair<uint32_t, uint32_t> get_2d_grid_dims(const IntArrayRef& shape,
return {static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y)};
}
static void scan_simple_mps_impl(const Tensor& self, const Tensor& output, int64_t dim, const std::string& op_name) {
if (output.numel() == 0) {
return;
}
const int64_t ndim = self.dim();
const int64_t wrapped_dim = maybe_wrap_dim(dim, ndim);
const int64_t axis_size = self.size(wrapped_dim);
// Preprocess input tensor - ensure it's contiguous for Metal shaders
Tensor input_tensor = self.contiguous();
// Preprocess output tensor - ensure it's contiguous for Metal shaders
Tensor output_tensor = output;
bool output_needs_copy = !output.is_contiguous();
Tensor temp_output;
if (output_needs_copy) {
// Create a temporary contiguous tensor with the same shape and type
temp_output = at::empty_like(output, output.options()).contiguous();
output_tensor = temp_output;
}
// Determine which kernel to use based on scan dimension position
bool is_innermost_scan = (wrapped_dim == ndim - 1);
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
// Build kernel name based on scan dimension position
const auto type_str = scalarToMetalTypeString(input_tensor);
const auto kernel_name = fmt::format("{}_{}_{}", op_name, is_innermost_scan ? "innermost" : "outer", type_str);
id<MTLComputePipelineState> scanPSO = lib.getPipelineStateForFunc(kernel_name);
// this function call is a no-op if MPS Profiler is not enabled
getMPSProfiler().beginProfileKernel(scanPSO, op_name, [&]() {
std::vector<Tensor> all_tensors = {input_tensor, output_tensor};
return all_tensors;
}());
[computeEncoder setComputePipelineState:scanPSO];
// Set input and output buffers (both guaranteed contiguous)
mtl_setBuffer(computeEncoder, input_tensor, 0);
mtl_setBuffer(computeEncoder, output_tensor, 1);
if (is_innermost_scan) {
// Contiguous scan dispatch (scanning innermost dimension)
mtl_setBytes(computeEncoder, axis_size, 2);
int n_reads = (input_tensor.element_size() <= 4) ? 4 : 2;
constexpr int simd_size = 32;
int elements_per_simd = n_reads * simd_size;
int thread_group_size = static_cast<int>(scanPSO.maxTotalThreadsPerThreadgroup);
if (axis_size <= n_reads * 1024) {
thread_group_size = ((axis_size + elements_per_simd - 1) / elements_per_simd) * simd_size;
} else if (axis_size <= n_reads * 2048) {
thread_group_size = ((axis_size / 2 + elements_per_simd - 1) / elements_per_simd) * simd_size;
}
thread_group_size = std::min(thread_group_size, static_cast<int>(scanPSO.maxTotalThreadsPerThreadgroup));
auto tmp_grid_dims = get_2d_grid_dims(input_tensor.sizes(), wrapped_dim);
[computeEncoder dispatchThreads:MTLSizeMake(thread_group_size, tmp_grid_dims.first, tmp_grid_dims.second)
threadsPerThreadgroup:MTLSizeMake(thread_group_size, 1, 1)];
} else {
// Strided scan dispatch (scanning non-innermost dimension)
size_t stride = input_tensor.strides()[wrapped_dim];
constexpr int bn = 32;
size_t stride_blocks = (stride + bn - 1) / bn;
mtl_setBytes(computeEncoder, axis_size, 2);
mtl_setBytes(computeEncoder, stride, 3);
mtl_setBytes(computeEncoder, stride_blocks, 4);
int n_reads = (input_tensor.element_size() <= 4) ? 4 : 2;
int n_simdgroups = bn / n_reads;
constexpr int simd_size = 32;
int thread_group_size = n_simdgroups * simd_size;
auto tmp_grid_dims = get_2d_grid_dims(input_tensor.sizes(), wrapped_dim);
if (tmp_grid_dims.first * stride_blocks <= UINT_MAX) {
tmp_grid_dims.first *= stride_blocks;
} else {
tmp_grid_dims.second *= stride_blocks;
}
[computeEncoder dispatchThreads:MTLSizeMake(thread_group_size, tmp_grid_dims.first, tmp_grid_dims.second)
threadsPerThreadgroup:MTLSizeMake(thread_group_size, 1, 1)];
}
getMPSProfiler().endProfileKernel(scanPSO);
}
});
// Post-process: copy result back to original output tensor if needed
if (output_needs_copy) {
output.copy_(output_tensor);
}
}
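As a worked instance of the innermost-dim sizing heuristic above (values made up): for a 4-byte dtype, n_reads = 4, so one simdgroup covers 4 * 32 = 128 elements; with axis_size = 1000 (which is <= 4 * 1024), thread_group_size = ceildiv(1000, 128) * 32 = 8 * 32 = 256, comfortably below a typical maxTotalThreadsPerThreadgroup of 1024.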
// Specialized implementation for cummin/cummax that returns both values and indices
static void scan_with_indices_mps_impl(const Tensor& self,
const Tensor& values_output,
@ -284,4 +390,29 @@ void cummin_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int6
}
}
Tensor& _logcumsumexp_out_mps(const Tensor& self, int64_t dim, Tensor& result) {
const auto wrap_dim = maybe_wrap_dim(dim, self.dim());
result.resize_(self.sizes());
if (self.dim() == 0) {
result.fill_(self);
return result;
}
if (self.numel() == 0) {
result.zero_();
return result;
}
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) {
mps::scan_simple_mps_impl(self, result, wrap_dim, "logcumsumexp");
} else {
mps::scan_mps_impl(self, {result}, wrap_dim, "logcumsumexp");
}
return result;
}
Tensor _logcumsumexp_mps(const Tensor& self, int64_t dim) {
Tensor result = at::empty_like(self, MemoryFormat::Contiguous);
return _logcumsumexp_out_mps(self, dim, result);
}
} // namespace at::native


@ -3741,11 +3741,13 @@
dispatch:
CPU: _logcumsumexp_cpu
CUDA: _logcumsumexp_cuda
MPS: _logcumsumexp_mps
- func: _logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU: _logcumsumexp_out_cpu
CUDA: _logcumsumexp_out_cuda
MPS: _logcumsumexp_out_mps
- func: logcumsumexp(Tensor self, int dim) -> Tensor
variants: function, method
@ -9478,14 +9480,12 @@
- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
-    CPU, CUDA: cholesky_out
-    MPS: cholesky_mps_out
+    CPU, CUDA, MPS: cholesky_out
- func: cholesky(Tensor self, bool upper=False) -> Tensor
variants: method, function
dispatch:
-    CPU, CUDA: cholesky
-    MPS: cholesky_mps
+    CPU, CUDA, MPS: cholesky
- func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
@ -12442,12 +12442,14 @@
dispatch:
CPU: max_pool3d_with_indices_backward_out_cpu
CUDA: max_pool3d_with_indices_backward_out_cuda
MPS: max_pool3d_with_indices_backward_out_mps
- func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor
python_module: nn
dispatch:
CPU: max_pool3d_with_indices_backward_cpu
CUDA: max_pool3d_with_indices_backward_cuda
MPS: max_pool3d_with_indices_backward_mps
- func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
@ -13939,8 +13941,7 @@
python_module: linalg
structured: True
dispatch:
-    CPU, CUDA: linalg_cholesky_ex_out
-    MPS: linalg_cholesky_ex_out_mps
+    CPU, CUDA, MPS: linalg_cholesky_ex_out
- func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor
python_module: linalg
@ -15592,7 +15593,7 @@
- func: special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
dispatch:
-    CPU, CUDA: special_shifted_chebyshev_polynomial_t_out
+    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_t_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15641,7 +15642,7 @@
- func: special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
dispatch:
-    CPU, CUDA: special_shifted_chebyshev_polynomial_u_out
+    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_u_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15690,7 +15691,7 @@
- func: special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
dispatch:
-    CPU, CUDA: special_shifted_chebyshev_polynomial_v_out
+    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_v_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15739,7 +15740,7 @@
- func: special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
dispatch:
-    CPU, CUDA: special_shifted_chebyshev_polynomial_w_out
+    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_w_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True


@ -5,7 +5,7 @@ namespace at {
namespace detail {
-TORCH_API inline void noopDelete(void*) {}
+inline void noopDelete(void*) {}
} // namespace detail


@ -83,16 +83,16 @@ ${Functions_declarations}
// Special C++ only overloads for std()-like functions (See gh-40287)
// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
// So, for example std(0) would select the std(unbiased=False) overload
-TORCH_API inline Tensor var(const Tensor& self, int dim) {
+inline Tensor var(const Tensor& self, int dim) {
   return at::var(self, IntArrayRef{dim});
 }
-TORCH_API inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {
+inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {
   return at::var_mean(self, IntArrayRef{dim});
 }
-TORCH_API inline Tensor std(const Tensor& self, int dim) {
+inline Tensor std(const Tensor& self, int dim) {
   return at::std(self, IntArrayRef{dim});
 }
-TORCH_API inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {
+inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {
   return at::std_mean(self, IntArrayRef{dim});
 }


@ -5,6 +5,9 @@
#include <c10/mobile/CPUCachingAllocator.h>
// At the moment caching allocator is only exposed to mobile cpu allocator.
#ifdef C10_MOBILE
TEST(CPUCachingAllocatorTest, check_alloc_free) {
c10::CPUCachingAllocator caching_allocator;
c10::WithCPUCachingAllocatorGuard cachine_allocator_guard(
@ -41,10 +44,9 @@ TEST(CPUCachingAllocatorTest, check_alloc_inside_free_outside) {
}
int main(int argc, char* argv[]) {
-  // At the moment caching allocator is only exposed to mobile cpu allocator.
-#ifdef C10_MOBILE
   ::testing::InitGoogleTest(&argc, argv);
   at::manual_seed(42);
   return RUN_ALL_TESTS();
-#endif /* C10_Mobile */
 }
+#endif /* C10_Mobile */


@ -5,14 +5,14 @@
__global__ void test_thrust_kernel() {
// thrust conversion
{
-    constexpr float num1 = float(1.23);
-    constexpr float num2 = float(4.56);
+    [[maybe_unused]] constexpr float num1 = float(1.23);
+    [[maybe_unused]] constexpr float num2 = float(4.56);
     assert(c10::complex<float>(thrust::complex<float>(num1, num2)).real() == num1);
     assert(c10::complex<float>(thrust::complex<float>(num1, num2)).imag() == num2);
   }
   {
-    constexpr double num1 = double(1.23);
-    constexpr double num2 = double(4.56);
+    [[maybe_unused]] constexpr double num1 = double(1.23);
+    [[maybe_unused]] constexpr double num2 = double(4.56);
assert(c10::complex<double>(thrust::complex<double>(num1, num2)).real() == num1);
assert(c10::complex<double>(thrust::complex<double>(num1, num2)).imag() == num2);
}
@ -46,11 +46,11 @@ __global__ void test_reinterpret_cast() {
assert(zzzz.real() == double(1));
assert(zzzz.imag() == double(2));
-    cuComplex cuComplex_zz = *reinterpret_cast<cuComplex*>(&zz);
+    [[maybe_unused]] cuComplex cuComplex_zz = *reinterpret_cast<cuComplex*>(&zz);
assert(cuComplex_zz.x == float(1));
assert(cuComplex_zz.y == float(2));
-    cuDoubleComplex cuDoubleComplex_zzzz = *reinterpret_cast<cuDoubleComplex*>(&zzzz);
+    [[maybe_unused]] cuDoubleComplex cuDoubleComplex_zzzz = *reinterpret_cast<cuDoubleComplex*>(&zzzz);
assert(cuDoubleComplex_zzzz.x == double(1));
assert(cuDoubleComplex_zzzz.y == double(2));
}


@ -33,7 +33,7 @@ __device__ void test(){
// use the std namespace, but just "::" so that the function
// gets resolved from nvcc math_functions.hpp
-  float threshold = 0.00001;
+  [[maybe_unused]] float threshold = 0.00001;
assert(::abs(::lgamma(Half(10.0)) - ::lgamma(10.0f)) <= threshold);
assert(::abs(::exp(Half(1.0)) - ::exp(1.0f)) <= threshold);
assert(::abs(::log(Half(1.0)) - ::log(1.0f)) <= threshold);


@ -61,6 +61,8 @@ namespace {
template <typename T>
class QuantizationTests : public ::testing::Test {};
template <typename T>
class Quantization8BitTests : public ::testing::Test {};
template <typename T>
class Quantization8BitWithTailTests : public ::testing::Test {};
template <typename T>
class FunctionalTests : public ::testing::Test {};
@ -79,6 +81,7 @@ namespace {
using FloatTestedTypes = ::testing::Types<vfloat, vdouble, vcomplex, vcomplexDbl>;
using ALLTestedTypes = ::testing::Types<vfloat, vdouble, vcomplex, vlong, vint, vshort, vqint8, vquint8, vqint>;
using QuantTestedTypes = ::testing::Types<vqint8, vquint8, vqint>;
using Quantization8BitTestedTypes = ::testing::Types<vqint8, vquint8>;
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
using Quantization8BitWithTailTestedTypes =
::testing::Types<vqint8, vquint8>;
@ -116,6 +119,7 @@ namespace {
TYPED_TEST_SUITE(BitwiseFloatsAdditional, RealFloatReducedFloatTestedTypes);
TYPED_TEST_SUITE(BitwiseFloatsAdditional2, FloatTestedTypes);
TYPED_TEST_SUITE(QuantizationTests, QuantTestedTypes);
TYPED_TEST_SUITE(Quantization8BitTests, Quantization8BitTestedTypes);
TYPED_TEST_SUITE(InfiniteTests, RealFloatTestedTypes);
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
TYPED_TEST_SUITE(
@ -1496,6 +1500,68 @@ namespace {
},
test_case);
}
#ifndef _WIN32
TYPED_TEST(Quantization8BitTests, Transpose) {
using VT = ValueType<TypeParam>;
constexpr auto M = 4;
constexpr auto N = 64;
constexpr auto L = M * N;
constexpr auto ld_src = N;
constexpr auto ld_dst = M;
CACHE_ALIGN VT x[L];
CACHE_ALIGN VT y[L];
CACHE_ALIGN VT ref[L];
auto seed = TestSeed();
ValueGen<VT> generator(VT(-100), VT(100), seed);
for (const auto i : c10::irange(L)) {
x[i] = generator.get();
}
at::native::utils::transpose<uint8_t>(
M, N,
reinterpret_cast<uint8_t*>(x), ld_src,
reinterpret_cast<uint8_t*>(y), ld_dst);
for (int64_t j = 0; j < N; j++) {
for (int64_t i = 0; i < M; i++) {
ref[j * ld_dst + i] = c10::load(&(x[i * ld_src + j]));
}
}
for (const auto i : c10::irange(L)) {
ASSERT_EQ(y[i], ref[i])
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
#if defined(CPU_CAPABILITY_AVX512)
TYPED_TEST(Quantization8BitTests, PackVNNI4) {
using VT = ValueType<TypeParam>;
constexpr auto K = 8;
constexpr auto N = 128;
constexpr auto L = K * N;
constexpr auto ld_src = N;
CACHE_ALIGN VT x[L];
CACHE_ALIGN VT y[L];
CACHE_ALIGN VT ref[L];
auto seed = TestSeed();
ValueGen<VT> generator(VT(-100), VT(100), seed);
for (const auto i : c10::irange(L)) {
x[i] = generator.get();
}
at::vec::pack_vnni4(x, y, ld_src, K, N);
int64_t _K = K / 4;
for (int64_t k = 0; k < _K; k++) {
for(int64_t n = 0; n < N; n++) {
for(int64_t l = 0; l < 4; l++) {
ref[k * N * 4 + n * 4 + l] =
c10::load(&(x[k * ld_src * 4 + l * ld_src + n]));
}
}
}
for (const auto i : c10::irange(L)) {
ASSERT_EQ(y[i], ref[i])
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
TYPED_TEST(FunctionalTests, Map) {
using vec = TypeParam;
using VT = ValueType<TypeParam>;


@ -1,6 +1,7 @@
#pragma once
-#include <ATen/cpu/vec/vec.h>
 #include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/cpu/vec/vec_quant.h>
#include <c10/util/bit_cast.h>
#include <c10/util/irange.h>
#include <gtest/gtest.h>
@ -21,7 +22,9 @@
#else
#define CACHE_LINE 32
#endif
#ifndef _WIN32
#include <ATen/native/cpu/utils.h>
#endif
#if defined(__GNUC__)
#define CACHE_ALIGN __attribute__((aligned(CACHE_LINE)))
#define not_inline __attribute__((noinline))


@ -601,6 +601,7 @@ libtorch_nativert_sources = [
"torch/nativert/executor/Placement.cpp",
"torch/nativert/executor/ExecutionPlanner.cpp",
"torch/nativert/executor/ExecutionFrame.cpp",
"torch/nativert/executor/Executor.cpp",
"torch/nativert/executor/GraphExecutorBase.cpp",
"torch/nativert/executor/ConstantFolder.cpp",
"torch/nativert/executor/OpKernel.cpp",


@ -634,7 +634,7 @@ class DispatchKeySet final {
C10_API std::string toString(DispatchKeySet);
C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet);
-C10_API inline int getDispatchTableIndexForDispatchKey(DispatchKey k) {
+inline int getDispatchTableIndexForDispatchKey(DispatchKey k) {
return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet();
}


@ -1559,7 +1559,7 @@ float chebyshev_polynomial_t_forward(T x, int64_t n) {
float q = x;
float r;
-  for (int64_t k = 2; k <= n; k++) {
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
@ -1603,7 +1603,7 @@ float chebyshev_polynomial_u_forward(T x, int64_t n) {
auto p = 1.0;
float r;
-  for (int64_t k = 2; k <= n; k++) {
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = 2 * x * q - p;
p = q;
q = r;
@ -1656,7 +1656,7 @@ float chebyshev_polynomial_v_forward(T x, int64_t n) {
auto p = 1.0;
float r;
-  for (int64_t k = 2; k <= n; k++) {
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = 2 * x * q - p;
p = q;
q = r;
@ -1713,7 +1713,7 @@ float chebyshev_polynomial_w_forward(T x, int64_t n) {
auto p = 1.0;
float r;
-  for (int64_t k = 2; k <= n; k++) {
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = 2.0 * x * q - p;
p = q;
q = r;
@ -1722,6 +1722,207 @@ float chebyshev_polynomial_w_forward(T x, int64_t n) {
return r;
} // chebyshev_polynomial_w_forward(T x, int64_t n)
template <typename T>
float shifted_chebyshev_polynomial_t_forward(T x, int64_t n) {
if (n < 0) {
return 0.0;
}
if (x == T(1.0)) {
return 1.0;
}
if (x == 0.0) {
if (n % 2 == 0) {
return 1.0;
}
return -1.0;
}
const float xpxm1 = x + x - 1.0;
if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
return ::metal::precise::cos(n * ::metal::precise::acos(xpxm1));
}
if (n == 0) {
return 1.0;
}
if (n == 1) {
return xpxm1;
}
float p = 1.0;
float q = xpxm1;
float r;
for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = (xpxm1 + xpxm1) * q - p;
p = q;
q = r;
}
return r;
} // shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
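This function and the three variants that follow lean on the same fact: the shifted Chebyshev polynomials are ordinary Chebyshev polynomials evaluated at 2x - 1, which is why every branch works through xpxm1 = x + x - 1. In LaTeX, the identity and recurrence the code relies on (summarized from the code itself):

T^{*}_{n}(x) = T_{n}(2x - 1) = \cos\!\left(n \arccos(2x - 1)\right) \quad \text{for } |2x - 1| \le 1,

T^{*}_{0}(x) = 1, \qquad T^{*}_{1}(x) = 2x - 1, \qquad T^{*}_{n+1}(x) = 2(2x - 1)\,T^{*}_{n}(x) - T^{*}_{n-1}(x).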
template <typename T>
float shifted_chebyshev_polynomial_u_forward(T x, int64_t n) {
if (n < 0) {
return 0.0;
}
if (x == 1.0) {
return n + 1;
}
if (x == 0.0) {
if (n % 2 == 0) {
return n + 1;
}
return -(n + 1);
}
const float xpxm1 = x + x - 1.0;
if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
const float acos_2xm1 = ::metal::precise::acos(xpxm1);
const float divisor = ::metal::precise::sin(acos_2xm1);
if (divisor != 0.0) {
return ::metal::precise::sin((n + 1) * acos_2xm1) / divisor;
}
return (n + 1) * ::metal::precise::cos((n + 1) * acos_2xm1) / xpxm1;
}
if (n == 0) {
return 1.0;
}
if (n == 1) {
return xpxm1 + xpxm1;
}
float p = 1.0;
float q = xpxm1 + xpxm1;
float r;
for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = (xpxm1 + xpxm1) * q - p;
p = q;
q = r;
}
return r;
} // shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
template <typename T>
float shifted_chebyshev_polynomial_v_forward(T x, int64_t n) {
if (n < 0) {
return 0.0;
}
if (x == 1.0) {
return 1.0;
}
if (x == 0.0) {
if (n % 2 == 0) {
return (n + n + 1);
}
return -(n + n + 1);
}
const float xpxm1 = x + x - 1.0;
if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
const float acos_2xm1 = ::metal::precise::acos(xpxm1);
if (::metal::precise::sin(acos_2xm1 / 2.0) != 1.0) {
return ::metal::precise::cos((n + 0.5) * acos_2xm1) /
::metal::precise::cos(acos_2xm1 / 2.0);
}
if (n % 2 == 0) {
return n + n + 1;
}
return -(n + n + 1);
}
if (n == 0) {
return T(1.0);
}
if (n == 1) {
return xpxm1 + xpxm1 - 1.0;
}
float p = 1.0;
float q = xpxm1 + xpxm1 - 1.0;
float r;
for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = (xpxm1 + xpxm1) * q - p;
p = q;
q = r;
}
return r;
} // shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
template <typename T>
float shifted_chebyshev_polynomial_w_forward(T x, int64_t n) {
if (n < 0) {
return 0.0;
}
if (x == 1.0) {
return n + n + 1;
}
if (x == 0.0) {
if (n % 2 == 0) {
return 1.0;
}
return -1.0;
}
const float xpxm1 = x + x - 1.0;
if ((n > 4) && (::metal::abs(xpxm1) < 1.0)) {
const float acos_2xm1 = ::metal::precise::acos(xpxm1);
if (::metal::precise::cos(acos_2xm1 / 2.0) != 1.0) {
return ::metal::precise::sin((n + 0.5) * acos_2xm1) /
::metal::precise::sin(acos_2xm1 / 2.0);
}
if (n % 2 == 0) {
return 1.0;
}
return -1.0;
}
if (n == 0) {
return 1.0;
}
if (n == 1) {
return xpxm1 + xpxm1 + 1.0;
}
float p = 1.0;
float q = xpxm1 + xpxm1 + 1.0;
float r;
for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
r = (xpxm1 + xpxm1) * q - p;
p = q;
q = r;
}
return r;
} // shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
template <typename T>
// TODO: Add 512 if/when double will be supported in Metal
inline constexpr int getHermitianLimit() {

Some files were not shown because too many files have changed in this diff.