Compare commits

..

169 Commits

Author SHA1 Message Date
a46360061f Lazy import to avoid circular import issue 2025-09-19 15:47:03 -07:00
2cb3744b19 DisableTorchFunction in debug_string 2025-09-16 13:00:05 -07:00
28d63dab1c fix 2025-09-15 20:28:14 -07:00
cd65a1777e fix test 2025-09-15 20:27:27 -07:00
a715282154 make the test cuda only 2025-09-15 14:54:28 -07:00
0b042565b4 fix test 2025-09-15 14:43:09 -07:00
7333340b12 address comments 2025-09-15 14:43:09 -07:00
6e3d2bf02f add doc 2025-09-15 14:43:09 -07:00
57d563e5bd add test case, fix tests 2025-09-15 14:43:09 -07:00
0e6fd3dc05 fix lint 2025-09-15 14:43:09 -07:00
60ac912f05 Refactored as general purpose DebugMode 2025-09-15 14:43:09 -07:00
cc22fefef8 DTensorDebugMode 2025-09-15 14:43:09 -07:00
456fbeaa6d [xla hash update] update the pinned xla hash (#162947)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162947
Approved by: https://github.com/pytorchbot
2025-09-15 11:42:02 +00:00
a8c80f3fa9 Update slow tests (#162946)
This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml).
Update the list of slow tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162946
Approved by: https://github.com/pytorchbot
2025-09-15 11:31:37 +00:00
bf6b40da3e fix deterministic scatter_add path for multi-d tensors (#162866)
Previously, `select` didn't work correctly for tensors with more than 2 dimensions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162866
Approved by: https://github.com/valentinandrei
2025-09-15 06:50:00 +00:00
814ba34fa6 [2/N] Port 5 _composable distributed test to Intel GPU (#159241)
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This is the second PR for _composable cases, the first is https://github.com/pytorch/pytorch/pull/159118.
We could enable Intel GPU with the following methods while trying our best to keep the original code style:

- Use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- Enable XPU for some test paths
- Skip some test cases that Intel GPU does not support
- Add "cpu:gloo,xpu:xccl" as the distributed backend

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159241
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-09-15 06:24:58 +00:00
06bb32d55e Skip empty tests, they don't make sense for numerics (#162932)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162932
Approved by: https://github.com/dcci
2025-09-15 06:20:26 +00:00
b3ad8f4a9c [BUG] Fix nonzero_static crash on CUDA when the input is an empty tensor (#162578)
Fixes #162473

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162578
Approved by: https://github.com/ngimel
2025-09-15 05:44:15 +00:00
755cf90672 Redirect all use of filesystem to c10/utils/FileSystem.h (#162914)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162914
Approved by: https://github.com/Skylion007, https://github.com/dcci, https://github.com/cyyever
2025-09-15 04:30:41 +00:00
76e5df3866 [BE] Use fmt::format to define Conv key (#162925)
Also use `getArrayRefString` instead of having separate cases for 2D and 3D Conv
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162925
Approved by: https://github.com/Skylion007
ghstack dependencies: #162921
2025-09-15 02:44:12 +00:00
7fe1f5ea49 [BE] Delete [Ventura|Sonoma]Ops header (#162921)
Was a temp solution to make PyTorch+MPS buildable on MacOS-12, but it's no longer needed, as in 2.9+ MPS is only supported on MacOS Sonoma+
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162921
Approved by: https://github.com/Skylion007, https://github.com/dcci
2025-09-15 02:44:12 +00:00
e156a07171 [Precompile] [RFC] Implement aot_compile_module (#162171)
This PR adds a new interface _aot_compile to `OptimizedModule`, so that the following is possible:

```
mod = SimpleLinearModule()
inputs = [
    ModelInput(
        args=(torch.randn(3, 3),),
        kwargs={},
        contexts=[torch.no_grad(), eval_mode(model)],
    ),
    ModelInput(
        args=(torch.randn(3, 3),), kwargs={}, contexts=[train_mode(model)]
    ),
]
assert isinstance(model, torch._dynamo.eval_frame.OptimizedModule)
model._aot_compile(
    inputs,
)
```

After this PR, you can AOT precompile NanoGPT and use it to train directly. I'll share my fork of the repo to make this work.

## ModelInput
The `ModelInput` API is a work in progress; for now it represents a set of inputs and contexts that instruct the compiler what to compile. Most commonly this is "compile an eval mode with no grad, and a training mode with grad", but it can also contain things like autocasting contexts, etc.

## Dispatch
Dispatching is super simple here: we just iterate through all the precompiled fullgraphs and check guards for each one until there's one that passes. I'm a bit worried that having this in Python code is going to be too expensive. The guard checks happen in C++ anyway, though, so the only Python-bottlenecked step here is the for loop itself, so perhaps the overhead will not be high. I'll work on measuring this, though.
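
A minimal sketch of that dispatch loop, with hypothetical entry/guard-manager names (not the actual API):

```python
# Hypothetical sketch of the dispatch loop described above; names are illustrative.
def dispatch(precompiled_entries, args, kwargs):
    for entry in precompiled_entries:
        # the guard check itself runs in C++; only this loop is Python
        if entry.guard_manager.check(args, kwargs):
            return entry.compiled_fn(*args, **kwargs)
    raise RuntimeError("no precompiled fullgraph matched the given inputs")
```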

## TODOs

This PR does not support `mod.compile()`, only `torch.compile(mod)`. In order to support `mod.compile()`, we'll need to update torch.nn.Module with an updated implementation — I can add that frontend later.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162171
Approved by: https://github.com/zhxchen17
2025-09-14 23:32:28 +00:00
ba5ca31676 [MPS] sparse mps any (#162885)
Add SparseMPS key for any op

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162885
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-09-14 18:57:53 +00:00
8e1db46493 [MPS] enable empty like and unsqueeze for SparseMPS (#162910)
Enable empty like and unsqueeze for SparseMPS

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162910
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-09-14 17:47:06 +00:00
aff2438554 QoL: add pip to requirements-build.txt (#162896)
uv venvs by default don't come with pip, but for example setup.py assumes it is available.

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162896
Approved by: https://github.com/Skylion007
2025-09-14 17:08:05 +00:00
3f8a2e62ea Fix rebind_unbacked in torch.fx.experimental.symbolic_shapes (#162788)
## Description
Fix float type handling in `torch.fx.experimental.symbolic_shapes`. [#162480](https://github.com/pytorch/pytorch/issues/162480)

## Issue
When using AOTInductor to compile YOLOv10, I encountered the bug `'float' object has no attribute 'node'`.
[Torch AOTInductor Ahead-Of-Time Compilation Fail](https://github.com/opendatalab/DocLayout-YOLO/issues/177)

The problem is due to missing float type handling.
https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L597
```
            if isinstance(u1, int):
                log.info(
                    "rebind_unbacked: discard %s %s %s -> %s",
                    n.target,
                    raw_u0,
                    path,
                    u1,
                )
                continue
```

## Solution
Change `if isinstance(u1, int)` to `if isinstance(u1, (int, float))`.
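
A minimal sketch of the patched branch, assuming the code quoted above:

```python
# Sketch of the fix: treat concrete floats the same way as concrete ints.
if isinstance(u1, (int, float)):
    log.info(
        "rebind_unbacked: discard %s %s %s -> %s",
        n.target,
        raw_u0,
        path,
        u1,
    )
    continue
```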

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162788
Approved by: https://github.com/ezyang
2025-09-14 17:07:14 +00:00
6d64bc3990 [data foundation][vizard] Prevent checking the device type of numpy object in Tensorboard logger (#162888)
Summary:
The check is introduced in D82262053
- `scalar_value` could be a numpy object
  - Move the `device.type` check into the `make_np` method so it happens only when the value is a `torch.Tensor`.

Test Plan:
```
vizard launch -j 1x8 --launch=flow --config-path=pkg://vizard_projects.image_classification.configs --config-name=resnet50 ++flow.secure_group=ml_sensors ++flow.entitlement=ai_frameworks_pnb ++max_train_steps_per_epoch=10 ++max_epochs=5 ++log_every_n_steps=10 ++profiler=null ++max_eval_steps_per_epoch=10
```

Rollback Plan:

Differential Revision: D82383428

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162888
Approved by: https://github.com/xush6528
2025-09-14 08:09:08 +00:00
972140b7e9 [benchmark] Add HF LLM benchmarks (#156967)
Results in https://docs.google.com/spreadsheets/d/1xXOPg9JjEmPx0zc5QBNdyXQq8-K2_r4ybHaiS-q7pZ0/edit?gid=88695043#gid=88695043

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156967
Approved by: https://github.com/huydhn

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-09-14 07:41:06 +00:00
84186c39ed [NVRTC] Enable compiling templated kernels (#162875)
Per NVRTC doc - https://docs.nvidia.com/cuda/nvrtc/index.html#accessing-lowered-names, we can compile a templated kernel (e.g. `kernel<float>`) with the following steps

NVRTC side
- (new) `nvrtcAddNameExpression` -> C++ template e.g. `f<float>`
- `nvrtcCompileProgram`
- (new) `nvrtcGetLoweredName` -> get the mangled name. Need to make a copy, since this string is freed later when the NVRTC program is destroyed
- `nvrtcDestroyProgram`

CUDA side
- use mangled name instead of normal name -> profit
- `extern "C"` is not even needed

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162875
Approved by: https://github.com/msaroufim
2025-09-14 06:17:36 +00:00
74a35c6344 [Triton] [Inductor] Enable TMA store for TMA mm templates (#160480)
Summary:
Adds support for TMA store in all TMA matmul templates (notably persistent_tma, including addmm and scaled_mm). This works by requiring that a template be registered with `tma_store=True`; when that is set, it constructs indices/range_trees to hook into the existing codebase's TMA store support.

This also includes a couple notable changes:
- Adds support in the TMA template for checking the output layout.
- Adds support for "hoisting" the tensor descriptor to the top of the kernel. This is currently only used by template code, but in principle it can be generalized to other implementations.
- Supports considering multiple indices as the "contiguous" index. This is handled by supporting a transpose of the input data when the alignment is no longer consistent. In general, since the TMA support is derived from the index, it doesn't seem reasonable that the 1D index math forces a certain alignment depending on index ordering, so long as the layout matches.

Test Plan:
Tested with test_max_autotune.py unit tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160480
Approved by: https://github.com/NikhilAPatel
2025-09-14 04:56:49 +00:00
d2f6daf6a7 [audio hash update] update the pinned audio hash (#162892)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162892
Approved by: https://github.com/pytorchbot
2025-09-14 04:27:37 +00:00
e74b21d66a [vllm hash update] update the pinned vllm hash (#162891)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162891
Approved by: https://github.com/pytorchbot
2025-09-14 04:27:35 +00:00
f01bf0f64b Do not use // but use CleanDiv or FloorDiv instead (#162869)
Summary:
When rewriting sympy expressions in the compiler codebase, we want to generate
FloorDiv(a, b) or CleanDiv(a, b) directly and not a//b, since the latter becomes floor(a*pow(b, -1)).

For symnodes we automatically handle that conversion in the symnode op dispatch.
I will follow up with an issue to track all other usages of //.
This was blocking an internal model.
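
A small sketch of the difference, assuming the `FloorDiv` helper from `torch.utils._sympy.functions` (import path assumed):

```python
import sympy
from torch.utils._sympy.functions import FloorDiv  # assumed import path

a, b = sympy.symbols("a b", integer=True, positive=True)

print(a // b)          # sympy rewrites this as floor(a/b), i.e. floor(a*pow(b, -1))
print(FloorDiv(a, b))  # stays as an opaque FloorDiv(a, b) node, which is what we want
```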

Test Plan:
add test
run existing tests.
dakechen1993 testing on the model.

Rollback Plan:

Differential Revision: D82362241

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162869
Approved by: https://github.com/ezyang
2025-09-14 01:30:33 +00:00
886699bc5c Port shared_ptr optimization in std::shared_ptr to intrusive_ptr (#162784)
Summary:
Please see D21021645 for details about the optimization and why it's beneficial.

A similar change has been added to libstdc++ as well, see dbf8bd3c2f

Rollback Plan:

Reviewed By: yfeldblum

Differential Revision: D81960754

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162784
Approved by: https://github.com/swolchok
2025-09-13 21:01:00 +00:00
72b5159782 [flatbuffer] Fix compile error due to discarded result (#162767)
Summary:
One of our builds fails because the return value of fread is discarded. Explicit cast to void fixes the build.

```log
In file included from fbcode/caffe2/torch/csrc/jit/mobile/import.cpp:15:
fbcode/caffe2/torch/csrc/jit/mobile/file_format.h:156:3: error: ignoring return value of function declared with 'warn_unused_result' attribute [-Werror,-Wunused-result]
  156 |   fread(data.get(), size, 1, f);
      |   ^~~~~ ~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
...
BUILD FAILED
Failed to build 'fbcode//caffe2:libtorch (cfg:opt-linux-x86_64-clang19-no-san-opt-by-default#fef256f7ee896871)'
```

Test Plan:
No runtime behavior change. CI.

Rollback Plan:

Differential Revision: D82265002

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162767
Approved by: https://github.com/Skylion007
2025-09-13 20:24:43 +00:00
f37eaebed1 Add missing tags parameter to custom_op overload signatures (#162047)
It appears to be an omission in #149782.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162047
Approved by: https://github.com/zou3519, https://github.com/BoyuanFeng

Co-authored-by: Boyuan Feng <fby.1994@gmail.com>
2025-09-13 19:57:23 +00:00
5b9114bf19 Revert "[ROCm/Windows] Support aotriton for scaled_dot_product_attention on Windows. (#162330)"
This reverts commit 62843c14bbf694f5722fd6e1075da4792507fe42.

Reverted https://github.com/pytorch/pytorch/pull/162330 on behalf of https://github.com/atalman due to Sorry reverting looks like broke windows nightlies see https://github.com/pytorch/pytorch/issues/162881 ([comment](https://github.com/pytorch/pytorch/pull/162330#issuecomment-3288544921))
2025-09-13 15:43:50 +00:00
deb7ebe0a3 Revert "[Reland] Use std::string_view in torchgen (#158625)"
This reverts commit 972e409829343cc2062aeee0994a9c1c735d216a.

Reverted https://github.com/pytorch/pytorch/pull/158625 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to break a couple of ExecuTorch tests for Vulkan backend ([comment](https://github.com/pytorch/pytorch/pull/158625#issuecomment-3287754275))
2025-09-13 07:52:50 +00:00
9c93dc8123 Revert "Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#160532)"
This reverts commit a956c4ab1cb13079203a8f07eb26218724f54dc8.

Reverted https://github.com/pytorch/pytorch/pull/160532 on behalf of https://github.com/huydhn due to Reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160532#issuecomment-3287745165))
2025-09-13 07:42:12 +00:00
31040b6357 Revert "port some distributed tensor test files for Intel GPU (#161703)"
This reverts commit 179f10621b418427fc6e92f58ea2b0bbe4cc9c52.

Reverted https://github.com/pytorch/pytorch/pull/161703 on behalf of https://github.com/huydhn due to Sorry for reverting your change but these tests are failing internally ([comment](https://github.com/pytorch/pytorch/pull/161703#issuecomment-3287720713))
2025-09-13 07:22:14 +00:00
aa41d3e49c Claude loves making these files in top level, ignore them for sanity. (#162806)
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162806
Approved by: https://github.com/albanD
2025-09-13 04:59:00 +00:00
f0fcf436c5 [audio hash update] update the pinned audio hash (#162864)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162864
Approved by: https://github.com/pytorchbot
2025-09-13 04:17:21 +00:00
5663910472 [vllm hash update] update the pinned vllm hash (#162751)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162751
Approved by: https://github.com/pytorchbot
2025-09-13 04:16:51 +00:00
da669d51bf fusion of large accumulated reads only at ir level (#161978)
This is to revert some of the changes in https://github.com/pytorch/pytorch/pull/158667

In particular, we only disallow fusion of large accumulated reads at the IR level and not at the scheduler level, as users can create their own custom fusion logic at the scheduler level.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161978
Approved by: https://github.com/yf225
2025-09-13 04:07:25 +00:00
783985e9fe kjt pytree registration (#161114)
Differential Revision: D80656182

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161114
Approved by: https://github.com/henryoier
2025-09-13 03:57:43 +00:00
49d30f9a23 Fix boxcox to return same result for same input in one batch (#162772)
Summary:
The SIMD path uses the SLEEF version of `pow`, which is slightly different from `std::pow`. The fix is to use the same vectorized code (with partial load and store) for the trailing data as well to ensure consistency between results.

Rollback Plan:

Differential Revision: D82265247

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162772
Approved by: https://github.com/swolchok
2025-09-13 03:57:35 +00:00
66133b1ab7 Build vLLM aarch64 nightly wheels (#162664)
PyTorch has published its aarch64 nightly wheels for all CUDA versions after https://github.com/pytorch/pytorch/pull/162364
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162664
Approved by: https://github.com/atalman
2025-09-13 03:43:55 +00:00
543d50db2b Fix torch export with dict input nested in args (#162618)
Investigated together with @pyemma and @taotaohuang001

## Problem
When calling the exported module with a dict nested in the args tuple, it produces the following complaint:
```
Traceback (most recent call last):
  File "/home/chzhu/infinitrain/test_torch_export.py", line 32, in <module>
    print(exported_model({"a2": torch.randn(10), "a1": torch.randn(10)}))
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 848, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 424, in __call__
    raise e
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 411, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1879, in _call_impl
    return inner()
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1806, in inner
    args_kwargs_result = hook(self, args, kwargs)  # type: ignore[misc]
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 929, in _fn
    return fn(*args, **kwargs)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/export/_unlift.py", line 81, in _check_input_constraints_pre_hook
    flat_args_with_path = _check_inputs_match(args, kwargs, self._in_spec)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/export/_unlift.py", line 64, in _check_inputs_match
    raise ValueError(  # noqa: B904
ValueError: Trying to flatten user inputs with exported input tree spec:
TreeSpec(tuple, None, [TreeSpec(tuple, None, [TreeSpec(dict, ['a1', 'a2'], [*,
      *])]),
  TreeSpec(dict, [], [])])
but actually got inputs with tree spec of:
TreeSpec(tuple, None, [TreeSpec(tuple, None, [TreeSpec(dict, ['a2', 'a1'], [*,
      *])]),
  TreeSpec(dict, [], [])]).
Please check that the inputs have the same number and type of args and kwargs as the ones you used when tracing.

```

## How to reproduce the issue
```python
import torch

# create a nn.Module with data_batch as input and output as output
class MyModel(torch.nn.Module):
   def __init__(self):
       super(MyModel, self).__init__()
       self.linear = torch.nn.Linear(10, 1)

   def forward(self, data_batch):
       h1 = self.linear(data_batch["a1"])
       h2 = self.linear(data_batch["a2"])
       return h1 + h2

# torch export this module
model = MyModel()
example_args_forward = (
   {
       "a1": torch.randn(10),
       "a2": torch.randn(10),
   },
)
exported_model = torch.export.export(model, example_args_forward, strict=True)

# save the exported model
torch.export.save(exported_model, "exported_model.pt2")

# load the exported model
exported_model = torch.export.load("exported_model.pt2").module()

# run the exported model
print(exported_model({"a2": torch.randn(10), "a1": torch.randn(10)}))

```

## Root Cause
The input spec is encoded as a [TreeSpec](582d278983/torch/utils/_pytree.py (L1059)) in torch export, with (args, kwargs) at the top level. When we call the exported model, a pre-execution [hook](582d278983/torch/export/_unlift.py (L66)) checks that the input TreeSpec matches the received TreeSpec; in a TreeSpec, the dict key order is preserved. Something like

TreeSpec(dict, ['a2', 'a1'], [*,*])

To work around this, the input check reorders [kwargs](582d278983/torch/export/_unlift.py (L67)), which is why kwargs can be out of order. But the dict nested in the args is not re-ordered, so any re-ordering of the keys will throw errors.

## Solution
Update eq_spec to handle the dict case, where we only guarantee that the key set is the same, without ordering constraints.
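
A hedged sketch of the relaxed comparison; the helper name and the TreeSpec fields used here are illustrative, not the actual implementation:

```python
# Illustrative only: when both specs are dicts, require the same key set but not the same order.
def dict_context_matches(spec_a, spec_b):
    if spec_a.type is dict and spec_b.type is dict:
        return set(spec_a.context) == set(spec_b.context)
    return spec_a.context == spec_b.context
```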
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162618
Approved by: https://github.com/angelayi
2025-09-13 03:24:30 +00:00
7dd5f7b125 Revert "python fastpath for DTensor detach(), confirm that aliasing DTensorSpec is ok (#160580)"
This reverts commit 4b2d297eec425475a82934a52e0edd96805524a1.

Reverted https://github.com/pytorch/pytorch/pull/160580 on behalf of https://github.com/bdhirsh due to this broke shampoo, yanking ([comment](https://github.com/pytorch/pytorch/pull/160580#issuecomment-3287372891))
2025-09-13 02:04:36 +00:00
a956c4ab1c Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#160532)
Summary:

To support exporting a cuda model on a CPU-only machine under fake tensor mode,
users commonly need to move sample inputs to the cuda device with a .to("cuda:0") or .to("cuda") call.
This diff supports that.
I expect the following pattern to work
```
with FakeTensorMode(allow_non_fake_inputs=True):
    cuda_module = module.to("cuda:0")
    cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs])
    with torch.no_grad():
        ep = torch.export.export(cuda_module, cuda_sample_inputs)
```

Test Plan:
CI

Rollback Plan:

Differential Revision: D80181887

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160532
Approved by: https://github.com/henryoier, https://github.com/ezyang
2025-09-13 01:50:51 +00:00
0925c644ed [DCP] Decrease checkpoint background process Gloo pg init timeout (#162760)
Summary:
Sometimes checkpoint background process creation times out during gloo pg init.
Attempting to destroy the process during that time can block the trainer thread until the timeout completes.

This diff reduces the pg init timeout from 30m -> 10m to reduce the cleanup time.

Test Plan:
CI

Rollback Plan:

Differential Revision: D81724668

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162760
Approved by: https://github.com/meetv18
2025-09-13 01:50:40 +00:00
b2553a6ec4 [AOTI] raise PyTorchStreamWriter open failed error code on windows (#162799)
While debugging the AOTI UT `TestAOTInductorPackage_cpu::test_add`, I found it didn't output a verbose error code when PyTorchStreamWriter failed to open a file.

This PR adds verbose error code output for debugging. A local test is shown below:
<img width="1124" height="653" alt="image" src="https://github.com/user-attachments/assets/01cb1a51-2982-4106-8b5b-c608ac26a075" />

The error code is 32, we can check the Windows error code 32 at https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
```
ERROR_SHARING_VIOLATION
32 (0x20)
The process cannot access the file because it is being used by another process.
```

This issue is caused by the file being opened by another process. I fixed the same issue for zip open in PR https://github.com/pytorch/pytorch/pull/162617, but I still have no idea how to open a file with shared access via `std::ofstream`. I will continue researching it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162799
Approved by: https://github.com/jansel
2025-09-13 01:41:14 +00:00
a749c40342 [Bilinear] move check to reset_parameters (#160952)
Fixes #160407

### Summary:
Moved the check to reset_parameters to allow a lazy `Bilinear` module. Lazy modules have in_features initialized to 0 and a pre-forward hook that initializes them to the appropriate shape, then calls reset_parameters.

### Impact:
module: nn, linear.py

### Test:

<img width="903" height="182" alt="Screenshot From 2025-08-19 13-27-12" src="https://github.com/user-attachments/assets/bc04b0d6-5174-4dc9-8b21-9e019b3822a5" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160952
Approved by: https://github.com/mikaylagawarecki
2025-09-13 01:17:10 +00:00
595e13feb7 [BE] [Inductor] Update NoValidChoicesError logic (#162814)
Summary: Updates the NoValidChoicesError logic to include some additional context about whether no choices exist or no choices compiled.

Test Plan:
NFC. Depending on CI.

Rollback Plan:

Differential Revision: D82312035

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162814
Approved by: https://github.com/mlazos
2025-09-13 00:45:50 +00:00
ddc5107601 An improved heuristic for operator reordering for peak memory + debugging logs (#161810)
Revisiting the idea in https://github.com/pytorch/pytorch/pull/140195

For the lpmf algorithm in the memory reorder pass, in some cases, when all the nodes that can be scheduled are quite large, it is beneficial to switch the scheduling strategy. So instead of using size as the criterion, we choose a node that can unlock more nodes to become schedulable by analyzing their successor nodes.
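
A rough sketch of the alternative criterion (illustrative pseudologic, not the actual scheduler code):

```python
# Illustrative only: pick the schedulable node that unlocks the most successors,
# instead of simply picking the smallest node.
def pick_next(schedulable, indegree, successors):
    def newly_unlocked(node):
        # successors whose only remaining unscheduled predecessor is `node`
        return sum(1 for s in successors[node] if indegree[s] == 1)
    return max(schedulable, key=newly_unlocked)
```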

For an internal use case, we observe up to 20 GiB memory difference and here are the before and after memory snapshot. More information can be found in [D81270682](https://www.internalfb.com/diff/D81270682) (internal only).

<img width="348" height="227" alt="image" src="https://github.com/user-attachments/assets/fb71e840-1508-44ed-bc9d-5eb4d364607d" />

In addition, add the functionality to upload the graph to tlparse for offline debugging. The format of the json is in consistency with the simulator [here](https://fburl.com/code/3l3d3qi4) (internal only).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161810
Approved by: https://github.com/yf225
2025-09-13 00:42:32 +00:00
a94ddd9b00 [OpenReg] Fix the docs of Accelerator Integration (#162826)
----

- Fixed the redirect link for step 1
- Formatted the autoload section and added the necessary links
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162826
Approved by: https://github.com/albanD
ghstack dependencies: #161917, #161918, #160101
2025-09-12 23:53:17 +00:00
29f84b0f61 [OpenReg] Improve the Event and Stream capabilities of DeviceGuardImplInterface (#160101)
**Changes:**

- Based on `OpenRegStream` and `OpenRegEvent`, we improve the implementation of Device Guard for `OpenReg`
- Add some related testcases
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160101
Approved by: https://github.com/albanD
ghstack dependencies: #161917, #161918
2025-09-12 23:53:17 +00:00
27daa6af6a [OpenReg] Strengthen Openreg's execution limits to minimize the waste of computing resources (#161918)
Currently, OpenReg supports Linux, Windows, and OS X, ensuring stability and ease of integration with third-party devices across all three platforms. It also doesn't rely on any other accelerators (such as CUDA or MPS).

Therefore, to minimize computational resource usage, `test_openreg` can be added to certain BLOCKLISTS to prevent its execution, limiting OpenReg's execution to only necessary scenarios.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161918
Approved by: https://github.com/albanD
ghstack dependencies: #161917
2025-09-12 23:53:17 +00:00
9b429846e8 [OpenReg] Migrate OpenReg Tests from tests/test_openreg.py into torch_openreg/tests (#161917)
**Background:**

Almost all the tests in `test/test_openreg.py` are designed for `torch_openreg`, so placing these testcases in the test directory is not a good idea. Instead, they should be moved to the `tests` directory under `torch_openreg`, coordinating these tests with their corresponding functional logic.

**How to do:**

So how do we verify the quality of the third-party device integration mechanism?
We will maintain a `test_openreg` entrypoint in `test/run_test.py`.

This entrypoint will install `torch_openreg` and run all the testcases located in `torch_openreg`. As long as all testcases pass, we can guarantee that the out-of-tree backend integration mechanism is available.

**Next:**

We will also improve `torch_openreg's` test coverage in the future.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161917
Approved by: https://github.com/albanD
2025-09-12 23:53:17 +00:00
cdfa298a3b Revert "[MTIA Runtime] Add foreach_div ops to native_functions.yaml (#162732)"
This reverts commit a3f01f6418667f791f36d928f7e912eb89be2e67.

Reverted https://github.com/pytorch/pytorch/pull/162732 on behalf of https://github.com/huydhn due to Reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162732#issuecomment-3287163750))
2025-09-12 23:52:43 +00:00
d25c35d2b2 [MPS] Fix [nan]median output for empty tensors (#162846)
It should be `NaN` rather than 0

Added respective checks to `test_empty_tensor`

Fixes https://github.com/pytorch/pytorch/issues/162798
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162846
Approved by: https://github.com/dcci
2025-09-12 22:26:29 +00:00
ee53ad2dd0 xpu: test py_limited_api with SyclExtension (#162546)
This commit extends the existing CUDA test to cover the XPU SyclExtension case for the same feature, `py_limited_api`. The commit required a fix for XPU to install some ATen header files (#145902), which got resolved after the merge of #159621.

See: https://github.com/pytorch/pytorch/issues/145902
Requires: https://github.com/pytorch/pytorch/pull/159621
Requires: https://github.com/intel/torch-xpu-ops/pull/1743

CC: @guangyey, @EikanWang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162546
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/janeyx99
2025-09-12 21:57:01 +00:00
0dcd9304aa fix high=0 bug in nll_loss test (#162763)
Minor bug fix for the `nll_loss` test.
Before this PR, it ran `torch.randint(high=0)`, which fails because it would try to generate a number x with x >= low and x < high, i.e. x >= 0 and x < 0.

The test did not fail on CPU because that line is never reached there: the test fails earlier because of an unsupported dtype.
However, as we support TPUs at Google, this line is reached before the dtype check, which triggers the bug.

To my understanding, these OpInfos should be general enough to support different hardware.
Fixing this obvious bug makes them more general across different hardware.
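
A minimal illustration of the failure mode (not the actual OpInfo code); the clamp to `high >= 1` is one possible fix, assumed here for illustration:

```python
import torch

# torch.randint(low, high, size) requires low < high, so high=0 with the default low=0 raises.
# torch.randint(0, 0, (5,))  # RuntimeError

num_classes = 0                 # the degenerate case hit by the test
high = max(num_classes, 1)      # one possible guard against high == 0
target = torch.randint(0, high, (5,))
print(target)
```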

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162763
Approved by: https://github.com/soulitzer
2025-09-12 21:48:18 +00:00
25f1a5d8d1 [inductor][ez] add src_hash property for Templates (#161468)
# why

enable caching/overriding/filtering based on src hash later

# what

- KernelTemplate has a src_hash that is None by default
- sha256 on TritonTemplate of the template src code
- None on ExternKernelChoice to have same API

# testing

n/a (not in use in this change)


Differential Revision: [D81821149](https://our.internmc.facebook.com/intern/diff/D81821149)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161468
Approved by: https://github.com/eellison
ghstack dependencies: #161351, #161350, #162293
2025-09-12 21:10:45 +00:00
269c9907a0 [inductor][choices] rename get_mm_configs to get_template_configs (#162293)
# why

- eventually we want all templates to go through this
- we're exposing this through diode as a sort of interface/API
- avoid later renaming

# what

- rename get_mm_configs to get_template_configs
- rename _finalize_mm_configs to _finalize_template_configs

# testing

- lintrunner
- ci

Differential Revision: [D81820641](https://our.internmc.facebook.com/intern/diff/D81820641)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162293
Approved by: https://github.com/eellison
ghstack dependencies: #161351, #161350
2025-09-12 21:10:45 +00:00
a326ef37e6 [inductor] leverage template stacking in V.choices.get_mm_configs (#161350)
# why

- now everything is in place to just gather templates and run
  the V.choices.get_mm_configs once per op
- enables any overrides inside V.choices.get_mm_configs to
  have a full view of the options for an op, not just for
  one template

# what

- replace multiple calls to V.choices.get_mm_configs with
  calls to gather the active templates, and then using those
  in a single call

# testing

```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```

Differential Revision: [D81520571](https://our.internmc.facebook.com/intern/diff/D81520571)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161350
Approved by: https://github.com/eellison, https://github.com/jansel
ghstack dependencies: #161351
2025-09-12 21:10:38 +00:00
cdb2d1838a [inductor] FlexibleLayout for ExternKernelChoice for mms (#161351)
# why

- if we only use ExternKernelChoice we're not doing any codegen
- if we're not doing any codegen, we can use a FlexibleLayout
  here, and provide deeper passes more chances to change it

# what

- if all the kernel template choices (KTC) are with an ExternKernelChoice
   template, we switch to a FlexibleLayout before generating the choice
- add a test to make sure that works as intended (FlexibleLayout for
  only extern, and FixedLayout if Triton is involved)

- caveats:
    - because CPP, CUTLASS, and CK are not using
       V.choices.get_mm_configs yet, we turn off the optimization
       if either of those backends are in use. This will be relaxed
       once they support this too
    - because Triton templates are still using their own calls
       (not a single call) to get_mm_configs, it's also turned
       off there. The next diff unifies Triton + ATEN to a single
       call to get_mm_configs and that in turn allows the optimization
       there too

# testing

```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```

Differential Revision: [D81520584](https://our.internmc.facebook.com/intern/diff/D81520584)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161351
Approved by: https://github.com/eellison, https://github.com/jansel
2025-09-12 21:10:31 +00:00
f7ea4975ab update the baseline data for the operator benchmark (#162693)
According to the results of the last four operator benchmark runs, we found that five models achieved more than a 30% improvement compared to the baseline. Therefore, we will update the operator benchmark baseline data.
We use the average results from the four runs as the new baseline for the five models.

This PR also adds a pull request trigger for the operator benchmark workflow.

Benchmarking Framework | Benchmarking Module Name | Case Name | tag | run_backward | old baseline | r1 | r2 | r3 | r4 | avg | speedup
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
PyTorch | add | add_M1_N1_K1_cpu | short | FALSE | 3.9497 | 2.57 | 2.54 | 2.38 | 2.31 | 2.45 | 1.61
PyTorch | functional.hardtanh | functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 67.118 | 50.02 | 49.80 | 46.78 | 48.94 | 48.88 | 1.37
PyTorch | relu6 | relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 68.739 | 51.17 | 51.19 | 48.07 | 50.42 | 50.21 | 1.37
PyTorch | relu6 | relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 69.1875 | 51.97 | 52.77 | 50.00 | 51.24 | 51.50 | 1.34
PyTorch | functional.hardtanh | functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 67.436 | 50.98 | 51.69 | 49.06 | 49.87 | 50.40 | 1.34

@chuanqi129 @huydhn @desertfire @jainapurva

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162693
Approved by: https://github.com/huydhn
2025-09-12 20:53:29 +00:00
65d642d6db [ROCm] enable aoti tests, forward fix 162353 (#162827)
Forward fix for tests added by #162353.  Enables aoti tests on rocm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162827
Approved by: https://github.com/dolpm, https://github.com/huydhn
2025-09-12 20:05:50 +00:00
fa4d5e76ea [Inductor] Fix ComboKernels failing due to missing helper functions (#162759)
Fixes: #162756

Differential Revision: [D82257359](https://our.internmc.facebook.com/intern/diff/D82257359)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162759
Approved by: https://github.com/eellison, https://github.com/mlazos
2025-09-12 20:01:06 +00:00
38afeb2ba2 Fix markdown link syntax in graph breaks index (#162400)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162400
Approved by: https://github.com/Skylion007
2025-09-12 19:29:49 +00:00
53b8bdb977 [MPS] enable cat op for sparse (#162007)
Enable cat op for sparse on MPS

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162007
Approved by: https://github.com/malfet
2025-09-12 19:07:39 +00:00
cad052423b [triton] Update 3.5 pin to 5ae38bdb0dc066c5823e34dc9797afb9de42c866 (#162821)
Include @aakhundov's sam_fast patch, plus NVIDIA's sm88/sm110 patches (thanks @nWEIdia)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162821
Approved by: https://github.com/atalman
2025-09-12 18:34:22 +00:00
b5f4a7dc14 Revert "[DeviceMesh] Make CuTe layout as mesh layout to be ready for using in DeviceMesh (#162414)"
This reverts commit 195ac549d7d6538c4212ca73f69488e990b9527d.

Reverted https://github.com/pytorch/pytorch/pull/162414 on behalf of https://github.com/malfet due to Looks like it broke test_circular_deps on Windows, see d89189f289/1 ([comment](https://github.com/pytorch/pytorch/pull/162414#issuecomment-3286070938))
2025-09-12 16:57:09 +00:00
d89189f289 Fix inconsistent clock types in ProcessGroupNCCL::runHookLoop (#162543)
## Summary
This PR fixes an inconsistency in `ProcessGroupNCCL::runHookLoop` when computing `timeStarted`. Both `timeFinished` and `timeStarted` in `WorkInfo` are expected to use `std::chrono::system_clock`, but previously the code was casting a duration from `steady_clock`.

Reviewers suggested using `steady_clock` consistently for time measurement since it is appropriate for durations (see #153135 ). This PR updates both `timeStarted` and `timeFinished` in `WorkInfo`, and corresponding code in `runHookLoop`, to use `std::chrono::steady_clock`.

## Error message:
```
libcxx/include/__memory/allocator_traits.h:302:5: error: no matching function for call to '__construct_at'
  302 |     std::__construct_at(__p, std::forward<_Args>(__args)...);
      |     ^~~~~~~~~~~~~~~~~~~
libcxx/include/__memory/shared_ptr.h:162:33: note: in instantiation of function template specialization 'std::allocator_traits<std::allocator<c10d::WorkInfo>>::construct<c10d::WorkInfo, c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, 0>' requested here
  162 |     allocator_traits<_TpAlloc>::construct(__tmp, __get_elem(), std::forward<_Args>(__args)...);
      |                                 ^
libcxx/include/__memory/shared_ptr.h:736:51: note: in instantiation of function template specialization 'std::__shared_ptr_emplace<c10d::WorkInfo, std::allocator<c10d::WorkInfo>>::__shared_ptr_emplace<c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, std::allocator<c10d::WorkInfo>, 0>' requested here
  736 |   ::new ((void*)std::addressof(*__guard.__get())) _ControlBlock(__a, std::forward<_Args>(__args)...);
      |                                                   ^
libcxx/include/__memory/shared_ptr.h:744:15: note: in instantiation of function template specialization 'std::allocate_shared<c10d::WorkInfo, std::allocator<c10d::WorkInfo>, c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, 0>' requested here
  744 |   return std::allocate_shared<_Tp>(allocator<__remove_cv_t<_Tp> >(), std::forward<_Args>(__args)...);
      |               ^
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2674:32: note: in instantiation of function template specialization 'std::make_shared<c10d::WorkInfo, c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, 0>' requested here
 2674 |         onCompletionHook_(std::make_shared<WorkInfo>(
      |                                ^
libcxx/include/__memory/construct_at.h:44:58: note: candidate template ignored: substitution failure [with _Tp = c10d::WorkInfo, _Args = <c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>>]: no matching constructor for initialization of 'c10d::WorkInfo'
   43 | template <class _Tp, class... _Args, class = decltype(::new(std::declval<void*>()) _Tp(std::declval<_Args>()...))>
      |                                                                                    ~~~
   44 | _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __construct_at(_Tp* __location, _Args&&... __args) {
      |                                                          ^
1 error generated.

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162543
Approved by: https://github.com/cyyever, https://github.com/Skylion007
2025-09-12 16:50:42 +00:00
d71a6497b7 Fix typo in ONNX export error message (#162819)
Fix another "summit" 😅

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162819
Approved by: https://github.com/cyyever, https://github.com/titaiwangms
2025-09-12 16:34:49 +00:00
a0dca0fc60 Fix protobuf test comparison by parsing proto instead of raw strings (#162644)
The tests were comparing raw exported strings for protobuf comparison, which is not backward/forward compatible with different versions of protobuf.

This PR parses the strings into protobuf and compares the protobufs directly, similar to what we did in assertImageProto.

Our test failed because we used a different version of protobuf, which output 44100.0 instead of 44100, resulting in an error. The values are equal; only the exported strings differ.
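
A minimal sketch of the structural comparison, assuming TensorBoard's Summary proto; the helper name and proto type are assumptions:

```python
# Illustrative only: parse both sides into protos and compare messages, so
# "44100" vs "44100.0" (same value, different text rendering) still match.
from google.protobuf import text_format
from tensorboard.compat.proto.summary_pb2 import Summary  # assumed proto type

def assert_summary_proto_equal(actual_str: str, expected_str: str) -> None:
    actual, expected = Summary(), Summary()
    text_format.Parse(actual_str, actual)
    text_format.Parse(expected_str, expected)
    assert actual == expected
```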

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162644
Approved by: https://github.com/justinchuby, https://github.com/Skylion007
2025-09-12 16:26:54 +00:00
e15686b40d Remove actionable label from docathon label sync script (#155713)
Make sure we don't propagate the actionable label in the docathon label sync script.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155713
Approved by: https://github.com/clee2000
2025-09-12 15:36:50 +00:00
1e9ddf510f [ROCm] fix hardsigmoid op (#162758)
Currently, std::min -> ::min does not work as expected on ROCm when input values are >= 2147483648.
It can be fixed by explicitly typing std::min<opmath_t>.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162758
Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-12 15:07:13 +00:00
7357eb66c5 [ROCm][CI] unskip some test_memory_format tests (#162766)
Fixes #70125.

Much of the work was done by #161687.
This PR is additional test cleanup.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162766
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-12 15:02:40 +00:00
03798b0f91 [inductor] Fix removal of constexpr args from the launcher signature (#161924)
Fixes the case described below which occurs when:
- A user `torch.compile`s a function that uses a triton kernel.
- `TORCHINDUCTOR_DUMP_LAUNCH_PARAMS=1` is set.

Problem:

If the user-defined triton kernel is not autotuned:

```python
import os
os.environ["TORCHINDUCTOR_DUMP_LAUNCH_PARAMS"] = "1"
@triton.jit
def kernel(..., BLOCK_SIZE: tl.constexpr):
    ...
@torch.compile
def fn(..)
    kernel[..](..., 128)

fn(..)
```

Then, in `triton_heuristics._interpret_args_grid`, the `filtered_signature` function:

```python
        def filtered_signature() -> list[str]:
                # constexprs are not passed in as args
                return [
                    x
                    for x in self.triton_meta["signature"].keys()
                    if x not in cfg.kwargs.keys()
                ]
```

because `triton.autotune` is not used on the `triton.jit` function, `cfg` above will be empty, and so `BLOCK_SIZE` will not be removed from the signature even though it is a constexpr and is removed from the arguments passed to `_interpret_args_grid`. This results in a mismatch between the number of parameters in the signature and the number of arguments, which leads to the error `NameError: name '_grid_2' is not defined`.

Fix:

Use the triton JIT kernel's `constexprs` to determine which args to remove. Not sure if this is a good fix, so suggestions are welcome.
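
A hedged sketch of what that could look like, assuming the jitted function exposes `constexprs` (parameter indices) and `arg_names`; not necessarily the exact implementation:

```python
# Illustrative only: also drop parameters the jitted kernel declares as tl.constexpr,
# even when no autotuner config (cfg.kwargs) supplies them.
def filtered_signature() -> list[str]:
    constexpr_names = {self.fn.arg_names[i] for i in self.fn.constexprs}
    return [
        x
        for x in self.triton_meta["signature"].keys()
        if x not in cfg.kwargs.keys() and x not in constexpr_names
    ]
```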

Test plan:

Added a parameter to an existing triton kernel to test for this edge case

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161924
Approved by: https://github.com/davidberard98
2025-09-12 13:58:09 +00:00
6c334885d4 [RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed modules importable even when backend not built (#159889) (#162594)
Summary:
Original: D81957844 and D81957923

Also, https://github.com/pytorch/pytorch/pull/162142 is patched in as well

#buildall

Test Plan:
sandcastle and oss ci

Rollback Plan:

Reviewed By: H-Huang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162594
Approved by: https://github.com/H-Huang, https://github.com/dcci
2025-09-12 10:54:42 +00:00
a7bbc5fea7 [Inductor-FX] Support ScatterFallback (#162686)
# Problem
Inductor has a `ScatterFallback` op with custom Python and C++ wrapper codegen macros. This is used in certain situations where the default Triton codegen doesn't apply, and especially for reductions which need to be deterministic. Since this op used direct Python/C++ codegen, it wasn't compatible with the FX backend.

# Feature
This PR refactors the associated wrapper codegen to support `ScatterFallback`. This follows the same basic steps that were used for other fallback ops including `MultiOutput` and `ExternKernel`:

1. Create a new wrapper IR op called `ScatterFallbackLine`. Move the logic in `ScatterFallback.codegen` to `ScatterFallbackLine.codegen`, to prevent it from affecting the FX backend. This logic is unsafe for FX because it may generate Python or C++ strings with methods like `codegen_reference()`.
2. To eliminate the dependence on `V.graph`, move language-specific logic to the respective wrapper codegen subclasses. In this case, C++ codegen has some special logic, which is moved to `CppWrapperCpu`.
3. Create a new method in `FXWrapperCodegen` to handle `ScatterFallbackLine`.

# Test plan
Added a couple of CI tests for the FX backend with scatter fallbacks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162686
Approved by: https://github.com/jansel
2025-09-12 08:41:50 +00:00
98e9440f30 [1/N] Port 5 _composable/fsdp distributed test cases to Intel GPU (#159118)
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU.
We could enable Intel GPU with the following methods while trying our best to keep the original code style:

- use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- enable XPU for some test paths
- skip some test cases that Intel GPU does not support

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159118
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-09-12 08:36:20 +00:00
66c0f14ecc Support XPU in --nproc-per-node option to torchrun (#159474)
Support both --nproc-per-node=xpu and autodetection of XPU device in case of --nproc-per-node=auto

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159474
Approved by: https://github.com/tsocha, https://github.com/guangyey, https://github.com/d4l3k

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
2025-09-12 08:32:04 +00:00
972e409829 [Reland] Use std::string_view in torchgen (#158625)
Reland of #157050, which is incidentally closed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158625
Approved by: https://github.com/albanD
2025-09-12 08:31:54 +00:00
52af91e4c1 [ROCm/Windows] Support load_inline on windows (#162577)
Supports `torch.utils.cpp_extension.load_inline` on Windows with ROCm.
Tested on Windows with gfx1201.

Note that it currently only works when CC and CXX are set to `clang-cl`. This is also needed when building extensions via `setuptools`, due to linker errors when using `cl` directly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162577
Approved by: https://github.com/ezyang
2025-09-12 08:10:07 +00:00
179f10621b port some distributed tensor test files for Intel GPU (#161703)
This is another PR to port distributed tensor tests for Intel GPU; the other PR is https://github.com/pytorch/pytorch/pull/161604
We could enable Intel GPU with the following methods while trying our best to keep the original code style:

- Use torch.accelerator for generic GPU support
- Skip cases with known issues when running on XPU

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161703
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-09-12 07:57:32 +00:00
195ac549d7 [DeviceMesh] Make CuTe layout as mesh layout to be ready for using in DeviceMesh (#162414)
We create a wrapper class acting as a layout for device mesh so that we can add new methods more specific to DeviceMesh and keep the core logic of CuTe manipulation inside the pycute module. This PR creates the main body of the code, and the next PR will come with the actual implementation and unit tests for the device mesh layout. (The actual implementation can be found in https://github.com/pytorch/pytorch/pull/161016)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162414
Approved by: https://github.com/ezyang
ghstack dependencies: #162413, #162534
2025-09-12 07:32:56 +00:00
636a511084 [aoti] add config for libtorch free so (#162655)
Users can specify the following to get a libtorch_free `.so`.

"aot_inductor.use_libtorch": False,

The following config is only used for torchnative (see https://github.com/meta-pytorch/torchnative/pull/110). It's not intended to be used by executorch. The reason we need it for torchnative is that a lot of the symbol definitions in the torchnative repo are only in header files.

"aot_inductor.libtorch_free_header": "/data/users/shangdiy/torchnative/standalone,/data/users/shangdiy/torchnative/" (or their custom headers)

The main motivating use case is for executorch to produce a libtorch free `.so`.
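
A hedged sketch of how such a config might be passed when packaging with AOTInductor; the entry point and the availability of these flags are assumptions based on the description above:

```python
# Illustrative only: pass the libtorch-free flags as inductor configs when packaging.
# `model` and `example_inputs` are assumed to be defined elsewhere.
import torch

ep = torch.export.export(model, example_inputs)
package_path = torch._inductor.aoti_compile_and_package(
    ep,
    inductor_configs={
        "aot_inductor.use_libtorch": False,
        "aot_inductor.libtorch_free_header": "/path/to/standalone/headers",  # hypothetical path
    },
)
```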

TODO for follow-up PR: this flag should be consolidated with the `compile_standalone` flag.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162655
Approved by: https://github.com/angelayi
2025-09-12 07:31:04 +00:00
75de5b65b4 [Dynamo] Don't guard data ptrs by default with mark_static_address (#162208)
Fixes https://github.com/pytorch/pytorch/issues/156377

Since we now re-record cudagraphs, it's not necessary to guard by default anymore and induce a full recompile.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162208
Approved by: https://github.com/anijain2305
2025-09-12 07:15:10 +00:00
6b59a19242 Revert "[RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed modules importable even when backend not built (#159889) (#162594)"
This reverts commit 6e8f17c58029e5fa6bc222b2445ebbc0cbdc17c7.

Reverted https://github.com/pytorch/pytorch/pull/162594 on behalf of https://github.com/huydhn due to Reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162594#issuecomment-3283985880))
2025-09-12 06:52:03 +00:00
5f66902ecf Fix operator benchmark issue#162708 (#162744)
This PR skips memory metric calculation for ops which don't take tensor input, fixing the operator_benchmark bug

Fixes https://github.com/pytorch/pytorch/issues/162708

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162744
Approved by: https://github.com/huydhn
2025-09-12 06:51:14 +00:00
00e9ba75cd Revert "[indexing] Prevent integer overflow from large step values in C++ (#161707)"
This reverts commit c140bf217f5ca5071ab9dbc1bcf9d4006242f44a.

Reverted https://github.com/pytorch/pytorch/pull/161707 on behalf of https://github.com/huydhn due to Look like there is a land race as lots of jobs are failing after this lands ([comment](https://github.com/pytorch/pytorch/pull/161707#issuecomment-3283980465))
2025-09-12 06:49:36 +00:00
333e546c02 [CUDAGraph][UX] warn many times for rerecording from dynamic shapes (#162696)
Excessive re-recording of CUDAGraphs leads to bad performance. We previously warned once if this happened.

However, the limit (=50) is too high and users may just observe bad performance before actually seeing the warning message. Even worse, users may not see the warning message when there are many other logs. @anijain2305 reported that he never saw this warning message when using the transformers library, but he DOES observe slowdown due to cudagraph re-recording and needs to turn off cudagraphs.

#162663 attempts to hard error when re-recording too many times due to dynamic shapes, but it is a BC-breaking change; in fact, the hf-t5-generate model in torchbench failed due to 256 re-recordings.

This PR a) reduces the limit to a smaller value (=8); and b) makes the warning harder to miss, i.e., warns once for every distinct shape once the limit is reached.

Fixes #162299

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162696
Approved by: https://github.com/mlazos
2025-09-12 06:38:32 +00:00
f7e8321961 fix cpp extension distributed warning spew (#162764)
With the new change, we only log the warning if we're running non-distributed code or if we're on rank 0. Unit testing that certain messages get printed only on certain ranks feels kinda janky, so the test plan is below instead.
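
A minimal sketch of the rank-gating check described above (illustrative, not the exact code in cpp_extension):

```python
# Illustrative only: emit the warning when not running distributed, or only on rank 0.
import torch.distributed as dist

def _should_emit_arch_list_warning() -> bool:
    if not dist.is_available() or not dist.is_initialized():
        return True               # non-distributed run: warn as before
    return dist.get_rank() == 0   # distributed run: only rank 0 warns
```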

Test plan

```python
# torchrun --nproc_per_node=2 demo_fix.py

import os
import logging

logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)

import torch
if 'RANK' in os.environ:
    torch.distributed.init_process_group('nccl')

from torch.utils.cpp_extension import _get_cuda_arch_flags
_get_cuda_arch_flags()

print(f"Rank {os.environ.get('RANK', '0')} done")
```

Logs showing how `TORCH_CUDA_ARCH_LIST` only shows up once if we explicitly set the logging level to `logging.DEBUG`. The change also improves the debug message to explain what the actual behavior will be.

```
(source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py

W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814]
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
[rank0]:V0911 18:30:18.921000 1316753 pytorch/torch/utils/cpp_extension.py:2444] TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='10.0+PTX' for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.
Rank 0 done
Rank 1 done
```

But if we just use the default and comment out `logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)`

Then we get

```
(source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814]
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
Rank 0 done
Rank 1 done
(source) [marksaroufim@devgpu005]~%
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162764
Approved by: https://github.com/ezyang, https://github.com/zou3519
2025-09-12 06:12:46 +00:00
30e16d6389 [nativert] aoti (#162353)
Summary: att

Test Plan:
ci

Rollback Plan:

Differential Revision: D81731425

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162353
Approved by: https://github.com/yiming0416
2025-09-12 05:56:25 +00:00
28e8531032 Add api info for torch._C._nn.pyi (#162361)
Fix part of #148404

APIs involved are as follows (a simplified stub sketch follows the list):

- im2col
- l1_loss
- mish
- mish_
- mse_loss
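
A simplified sketch of what the added stub entries look like (the exact overloads and defaults in `torch/_C/_nn.pyi` may differ):

```python
# Stub-style signatures; bodies are intentionally "..." as in a .pyi file.
from torch import Tensor

def mish(input: Tensor) -> Tensor: ...
def mish_(input: Tensor) -> Tensor: ...
def l1_loss(input: Tensor, target: Tensor, reduction: int = ...) -> Tensor: ...
def mse_loss(input: Tensor, target: Tensor, reduction: int = ...) -> Tensor: ...
```
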
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162361
Approved by: https://github.com/ezyang
2025-09-12 05:56:22 +00:00
0babdfad63 [1/N] Port 6 fsdp distributed test cases to Intel GPU (#160158)
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU.
We enable Intel GPU with the following methods and try our best to keep the original code style (a short device-selection sketch follows the list):

- Use instantiate_device_type_tests()
- Use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- Enable XPU for some test paths
- Add allow_xpu=True for supported test classes
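
A short sketch of the device-agnostic pattern used above (the accelerator-to-backend mapping is an assumption for illustration; actual test code may differ):

```python
import torch

# Pick the current accelerator (cuda, xpu, ...) and map it to a distributed backend.
acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"
backend = {"cuda": "nccl", "xpu": "xccl"}.get(device_type, "gloo")
# e.g. torch.distributed.init_process_group(backend=backend) inside the test harness
```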

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160158
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-09-12 05:52:08 +00:00
561430edcd [CuTe] Add type for CuTe layout via claude (#162534)
This PR is mostly a cosmetic change, using Claude to add types to the copied PyCute code.
We removed all linter suppressions and added type checks, type aliases, and mypy ignores (where needed) so that the pycute code is checked by the linter.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162534
Approved by: https://github.com/ezyang, https://github.com/Skylion007
ghstack dependencies: #162413
2025-09-12 04:59:21 +00:00
79d2418b5a [inductor] Add FLOAT_IS_NAN and COMPLEX_IS_NAN guards (#162537)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162537
Approved by: https://github.com/anijain2305, https://github.com/mlazos
ghstack dependencies: #162528
2025-09-12 04:32:46 +00:00
5dd84559a5 [dynamo] Add DUAL_LEVEL_MATCH C++ guard (#162528)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162528
Approved by: https://github.com/anijain2305
2025-09-12 04:32:46 +00:00
5dd14f0b65 [CuTe] Copy code from pycute for device mesh bookkeeping (#162413)
We copied the whole module and its unit test into the PyTorch codebase. (https://github.com/NVIDIA/cutlass/blob/main/python%2Fpycute%2Flayout.py).

We changed the code indentation from 2 spaces to 4 spaces and added lint suppressions to make mypy happy.

We also changed the unit test to include ownership and use `run_tests, TestCase` so that the test gets picked up by CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162413
Approved by: https://github.com/ezyang, https://github.com/Skylion007
2025-09-12 04:28:03 +00:00
95191522e0 [OpenReg] Implement device autoload mechanism (#158555)
# Implement OpenReg device autoload mechanism

## Overview
The **Autoload** mechanism in PyTorch simplifies the integration of third-party device backends by enabling automatic discovery and initialization at runtime. Traditionally, integrating a new backend required explicit imports or manual initialization, which could be cumbersome and error-prone. With Autoload, PyTorch dynamically detects and initializes device backends, providing a seamless user experience.

This mechanism leverages Python entry points (e.g., `torch.backends`) and dynamic module loading. When PyTorch starts, it scans for registered entry points and invokes their initialization hooks, ensuring that all available backends are ready for use without requiring explicit imports.
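
As a hedged illustration, an out-of-tree backend can register itself for autoload roughly like this (the package and hook names are hypothetical; `torch.backends` is the entry-point group mentioned above):

```python
# setup.py sketch for a third-party backend such as torch_openreg
from setuptools import setup

setup(
    name="torch_openreg",
    entry_points={
        "torch.backends": [
            # PyTorch discovers this entry point at startup and calls the hook.
            "torch_openreg = torch_openreg:_autoload",
        ],
    },
)
```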

## Motivation

This PR aims to apply [device autoload mechanism](https://github.com/pytorch/pytorch/issues/122468) to the OpenReg module with some simple changes.

## Change
### Before
```python
import torch
import torch_openreg

x = torch.tensor([1, 2, 3], device="openreg")
print(x)
```
### After
```python
import torch

# No need to import torch_openreg manually!
x = torch.tensor([1, 2, 3], device="openreg")
print(x)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158555
Approved by: https://github.com/FFFrog, https://github.com/albanD

Co-authored-by: Jiawei Li <ljw1101.vip@gmail.com>
2025-09-12 04:24:11 +00:00
da954f10d6 Bump protobuf from 5.29.4 to 5.29.5 in /.github/requirements (#160844)
Bumps [protobuf](https://github.com/protocolbuffers/protobuf) from 5.29.4 to 5.29.5.
<details>
<summary>Commits</summary>
<ul>
<li><a href="f5de0a0495"><code>f5de0a0</code></a> Updating version.json and repo version numbers to: 29.5</li>
<li><a href="85637662f7"><code>8563766</code></a> Merge pull request <a href="https://redirect.github.com/protocolbuffers/protobuf/issues/21858">#21858</a> from shaod2/py-cp-29</li>
<li><a href="05ba1a8104"><code>05ba1a8</code></a> Add recursion depth limits to pure python</li>
<li><a href="1ef3f01c46"><code>1ef3f01</code></a> Internal pure python fixes</li>
<li><a href="69cca9b7f5"><code>69cca9b</code></a> Remove fast-path check for non-clang compilers in MessageCreator. (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/21612">#21612</a>)</li>
<li><a href="21fdb7acdb"><code>21fdb7a</code></a> fix: contains check segfaults on empty map (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20446">#20446</a>) (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20904">#20904</a>)</li>
<li><a href="03c50e3874"><code>03c50e3</code></a> Re-enable aarch64 tests. (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20853">#20853</a>)</li>
<li><a href="128f0aafd9"><code>128f0aa</code></a> Add volatile to featuresResolved (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20767">#20767</a>)</li>
<li><a href="bdd49bb141"><code>bdd49bb</code></a> Merge pull request <a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20755">#20755</a> from protocolbuffers/29.x-202503192110</li>
<li><a href="c65946848f"><code>c659468</code></a> Updating version.json and repo version numbers to: 29.5-dev</li>
<li>See full diff in <a href="https://github.com/protocolbuffers/protobuf/compare/v5.29.4...v5.29.5">compare view</a></li>
</ul>
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=protobuf&package-manager=pip&previous-version=5.29.4&new-version=5.29.5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/pytorch/pytorch/network/alerts).

</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160844
Approved by: https://github.com/msaroufim

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-12 04:23:03 +00:00
d959eb02cb [audio hash update] update the pinned audio hash (#162752)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162752
Approved by: https://github.com/pytorchbot
2025-09-12 04:18:54 +00:00
62f044e260 Bump setuptools from 72.1.0 to 78.1.1 in /.github/requirements (#162701)
Bumps [setuptools](https://github.com/pypa/setuptools) from 72.1.0 to 78.1.1.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v72.1.0...v78.1.1)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-version: 78.1.1
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-11 21:03:27 -07:00
2335f90414 [ONNX] Support enable_gqa when dropout is non-zero (#162771)
Fixes #162258
Related to https://github.com/microsoft/onnxscript/pull/2558

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162771
Approved by: https://github.com/justinchuby
2025-09-12 04:00:57 +00:00
6e8f17c580 [RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed modules importable even when backend not built (#159889) (#162594)
Summary:
Original: D81957844 and D81957923

Also, https://github.com/pytorch/pytorch/pull/162142 is patched in as well

#buildall

Test Plan:
sandcastle and oss ci

Rollback Plan:

Reviewed By: H-Huang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162594
Approved by: https://github.com/H-Huang, https://github.com/dcci
2025-09-12 03:56:18 +00:00
31345fb4f7 Make functorch notebook symlinks PEP 517 valid (#157813)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157813
Approved by: https://github.com/zou3519, https://github.com/atalman
2025-09-12 03:52:08 +00:00
872ed60679 [mxfp8 torch._scaled_grouped_mm] fix meta registration for 3d tensor (#162765)
The meta registration checks for torch._scaled_grouped_mm have a bug for 3d "B" tensors. Namely, the scale shape for such a tensor should be 2d with shape (G, blocked_K * blocked_N), but the check currently enforces an expected 3d shape of (G, blocked_K, blocked_N).

See Blas.cpp for correct validation logic [here](8e217a9f6d/aten/src/ATen/native/cuda/Blas.cpp (L1622)).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162765
Approved by: https://github.com/ngimel
2025-09-12 03:51:52 +00:00
e8eeb06034 Move inductor jobs 3.9->3.10 (#162323)
Related to: https://github.com/pytorch/pytorch/issues/161167

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162323
Approved by: https://github.com/huydhn, https://github.com/Skylion007

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-09-12 03:43:06 +00:00
3cd734584d bring back the old vllm's use_existing_torch.py (#162747)
vLLM's PR overrides our dependencies for torch.

Quick fix to add back use_existing_torch.py; syncing with vLLM now regarding the uv approach they have.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162747
Approved by: https://github.com/huydhn
2025-09-12 03:41:39 +00:00
222ec8d28e Revert "AMD CPU CI - Add freezing + fix label trigger (#162176)"
This reverts commit 9cac1b92595ec7836101d51dbe1415081042c7a0.

Reverted https://github.com/pytorch/pytorch/pull/162176 on behalf of https://github.com/huydhn due to Sorry for reverting this but hardcoding the input online 122 does not make sense ([comment](https://github.com/pytorch/pytorch/pull/162176#issuecomment-3283532452))
2025-09-12 03:39:13 +00:00
c140bf217f [indexing] Prevent integer overflow from large step values in C++ (#161707)
Fixes https://github.com/pytorch/pytorch/issues/160868
Hmm, I found an existing fix PR after I finished this one. For reference, the old PR was
https://github.com/pytorch/pytorch/pull/147433/files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161707
Approved by: https://github.com/leslie-fang-intel, https://github.com/CaoE, https://github.com/mlazos
2025-09-12 03:16:23 +00:00
7eb92b076f [Inductor][FP8] Validate exhaustive autotuning for FP8 Inductor templates (#162678)
Summary: Validate exhaustive autotuning for FP8 Inductor templates: scaled MM templates require `block_k >= 32`. Before, exhaustive autotuning defaulted to a limited set of autotuning configs, as limitations for exhaustively autotuning on FP8 shapes had not been tested.

Test Plan:
```
CUDA_VISIBLE_DEVICES=0 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE=DEFAULT buck2 run mode/{opt,inplace} pytorch/tritonbench:run -- --op fp8_gemm --only torch_fp8_gemm,pt2_fp8_gemm --metrics tflops,accuracy --input-loader=/home/jananisriram/personal/exhaustive_autotune_rowwise_persistent_tma/json_files/rowwise_ptma_0.json --output="/home/jananisriram/personal/exhaustive_autotune_rowwise_persistent_tma/autotune/gpu0_bench.csv" --atol=1e-2 --rtol=0.5 2>&1 | tee ~/personal/exhaustive_autotune_rowwise_persistent_tma/autotune/gpu0.log
```

Rollback Plan:

Differential Revision: D82174075

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162678
Approved by: https://github.com/coconutruben
2025-09-12 02:12:33 +00:00
ccb450b190 [pre_compile] Add check for cuda and hardware version (#162438)
If we detect that the compiled model is using CUDA in a meaningful way, we should store information about the CUDA version and hardware.

 Example: `SystemInfo(python_version='3.12.9', torch_version='2.9.0a0+gite02b0e6', cuda_version='12.6', triton_version=(3, 4), gpu_name='NVIDIA PG509-210')`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162438
Approved by: https://github.com/zhxchen17
2025-09-12 01:42:07 +00:00
ae97eb86f7 Reland "Fix conv exhaustive autotuning and expand Exhaustive test coverage" (#161957)
reland https://github.com/pytorch/pytorch/pull/159387

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161957
Approved by: https://github.com/coconutruben
2025-09-12 01:36:43 +00:00
7a9c4d794c [BUG]Fixed handle cannot be hit in the cache in the IPC ExpandableSegment (#161885)
Fixed a bug where the handle could not be hit in the ipcMemHandle_to_devptr cache in the IPC scenario of ExpandableSegment.

Fixes #161884

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161885
Approved by: https://github.com/albanD
2025-09-12 01:09:17 +00:00
501e19137a fix var args for shape guards (#162633)
Summary: Fixes #162599

Test Plan:
added test based on repro

Rollback Plan:

Differential Revision: D82144520

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162633
Approved by: https://github.com/tugsbayasgalan
2025-09-12 00:33:35 +00:00
4a757e1e17 [ROCm] Support torch.cuda._compile_kernel (#162510)
Supports `torch.cuda._compile_kernel` on ROCm. Related to https://github.com/pytorch/pytorch/pull/151484
Tested on Windows with gfx1201. Testing on Linux pending.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162510
Approved by: https://github.com/mycpuorg, https://github.com/msaroufim
2025-09-12 00:18:47 +00:00
563921619b Fix the regression issue caused by non-arrch64 platforms not hitting the MKLDNN path. (#162168)
This issue was introduced by the commit in issue #161065. Added an extra check to provide a proper path for other platforms.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162168
Approved by: https://github.com/mingfeima, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-09-12 00:17:08 +00:00
84d8ec73f1 [CD] Build Mac wheels using setup-python action (#162136)
The biggest difference between the conda and homebrew CPython builds and the one from python.org is that the latter are universal binaries, so they always try to build universal extensions...

Work around the many universal-binary build attempts by explicitly specifying both `_PYTHON_PLATFORM` and `--plat-name` as well as `ARCH_FLAGS`

Suppressed an actionlint warning on the use of the `freethreaded` flag, which is documented in https://github.com/actions/setup-python/tree/v5

TODO: Remove lots of temporary workarounds when `3.14` is out in October 2025

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162136
Approved by: https://github.com/atalman, https://github.com/huydhn
ghstack dependencies: #162297, #162265
2025-09-12 00:16:31 +00:00
a956066b4e [ROCm] Define uint32 t when ROCM_VERSION >= 70000 (#160587)
This PR fixes the errors like below:
```
[rank3]: RuntimeError: The following operation failed in the TorchScript interpreter.
[rank3]: Traceback of TorchScript (most recent call last):
[rank3]: RuntimeError: /tmp/comgr-28f951/input/CompileSourceACC062:67:7: error: unknown type name 'uint32_t'; did you mean '__hip_internal::uint32_t'?
[rank3]:    67 |       uint32_t int32;
[rank3]:       |       ^~~~~~~~
[rank3]:       |       __hip_internal::uint32_t
```
Earlier, uint32_t was defined in the std namespace in HIP headers. It has now moved to the __hip_internal namespace in the HIP headers; this change was made in ROCm 7.0.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160587
Approved by: https://github.com/jeffdaily
2025-09-12 00:13:26 +00:00
ff6870d134 [BE][flex attention] compute RMSE in float64 (#162088)
I saw a failure where the reference error was 0.0, and the compiled error was 0.035. Although the failure still occurs with or without this change, it was confusing to see RMSE of 0.0.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162088
Approved by: https://github.com/drisspg
2025-09-11 23:53:31 +00:00
92f9ed7ac3 Revert "[2/N]Port several test files under test/distributed to Intel GPU (#159473)"
This reverts commit fa1d409e83af93425a2672d62e134e8f20c5ccc0.

Reverted https://github.com/pytorch/pytorch/pull/159473 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to break an distributed tests ([comment](https://github.com/pytorch/pytorch/pull/159473#issuecomment-3282999084))
2025-09-11 23:51:21 +00:00
8e217a9f6d [precompile] Fix issues with guard serialization on distributed types. (#162418)
Summary: Add more support for torch internal distributed data structures.

Test Plan:
test_guard_serialization.py

Rollback Plan:

Differential Revision: D81927732

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162418
Approved by: https://github.com/dolpm
2025-09-11 23:09:55 +00:00
429052f151 fix: raise value error on init ParametrizationList if original.device != new.device (#162717)
Raise a ValueError on init of `ParametrizationList` if `original.device != new.device`.
Currently `_maybe_set` throws the error below in such situations, which is not convenient to debug.

```
[rank1]: RuntimeError: Attempted to set the storage of a tensor on device "cuda:1" to a storage on different device "cpu".  This is no longer allowed; the devices must match.
```
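
A hypothetical repro sketch of the situation this PR now rejects up front (the parametrization returns a tensor on a different device than the original parameter):

```python
import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize

class ToCPU(nn.Module):
    def forward(self, X):
        return X.cpu()  # output device differs from the original CUDA parameter

m = nn.Linear(4, 4, device="cuda")
# With this change, registration raises a ValueError immediately instead of the
# confusing storage-device RuntimeError above.
parametrize.register_parametrization(m, "weight", ToCPU())
```
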
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162717
Approved by: https://github.com/lezcano
2025-09-11 23:07:58 +00:00
a3f01f6418 [MTIA Runtime] Add foreach_div ops to native_functions.yaml (#162732)
Summary: Quick fix for runtime support on foreach_div, see D81274963. Fixed an issue that I created in that diff so that the CIs pass.

Test Plan:
CIs created in D81274963 and D81286593 pass.

Added some logs in [aten_mtia_ops.py](https://www.internalfb.com/code/fbsource/[c56272ba042c43c65517dcac254364cf732fcfa9]/fbcode/mtia/host_runtime/torch_mtia/aten_mtia_ops.cpp?lines=3676) to all the foreach_div ops. We can see that the correct MTIA kernels are being invoked in the tests.
https://www.internalfb.com/intern/testinfra/testrun/15481123829281588

Rollback Plan:

Differential Revision: D82161434

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162732
Approved by: https://github.com/danielhou0515
2025-09-11 22:47:03 +00:00
62843c14bb [ROCm/Windows] Support aotriton for scaled_dot_product_attention on Windows. (#162330)
Enables flash attention and/or memory-efficient attention on Windows with scaled_dot_product_attention via aotriton.
Already tested to be working on Windows with TheRock.

Steps to enable: simply set `USE_FLASH_ATTENTION=1` and `USE_MEM_EFF_ATTENTION=1` as usual. See https://github.com/ROCm/TheRock/blob/main/external-builds/pytorch/build_prod_wheels.py#L578-L604

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162330
Approved by: https://github.com/xinyazhang, https://github.com/ScottTodd, https://github.com/jeffdaily

Co-authored-by: Scott Todd <scott.todd0@gmail.com>
2025-09-11 22:35:09 +00:00
082d3dd9d5 [Triton] [Inductor] Restrict subprocess autotuning to just Triton (#162688)
Summary: Restricts subprocess benchmarking to only `TritonTemplateCaller`, which is expected by the underlying `target` method. This triggered a bug with large K shapes because decompose-k is a `SubgraphChoiceCaller`.

Test Plan:
mm autotuning with a large k and `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1`

Rollback Plan:

Differential Revision: D82181924

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162688
Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/mlazos
2025-09-11 22:17:57 +00:00
468c1f9e9d Revert "[nn] Assert parsed iterable arguments are an appropriate length (#162340)"
This reverts commit b5e6e58050bd2a15f4173cfffa00c7e32e382b49.

Reverted https://github.com/pytorch/pytorch/pull/162340 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to break an MPS tests on ExecuTorch ([comment](https://github.com/pytorch/pytorch/pull/162340#issuecomment-3282676242))
2025-09-11 21:22:57 +00:00
9614c2eb14 [Triton] [Inductor] Pruned failed compilations from Autotuning candidates (#162673)
Summary:
When exhaustively autotuning a new template you may hit situations that lead to compilation failures. The template would still attempt to autotune because nothing marked it as failed, and in my experiments this led to a crash/segfault if I didn't set `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1`.

To help eliminate this issue this PR marks any template that fails to compile as "failed" and then removes all of the failed templates from the choice candidates. In the case where it would have just failed to compile twice, this should at least reduce compilation time.

Test Plan:
Tested locally when experimenting with the new Blackwell templates and a Triton version that contains a bug related to `num_warps < 4`.

Rollback Plan:

Differential Revision: D82172207

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162673
Approved by: https://github.com/PaulZhang12, https://github.com/mlazos
2025-09-11 21:22:36 +00:00
4c6a6c2db9 [Inductor][FP8] Add new scaled_mm and scaled_persistent_mm configs to Inductor FP8 Triton templates (#162699)
Summary:
Add new `scaled_mm` and `scaled_persistent_mm` configs to `template_heuristics.py` for Inductor FP8 Triton templates. These configs are a representative subset of the most performant configs generated from exhaustively autotuning FP8 Triton kernels with per-tensor and per-row scaling.

See this [spreadsheet](https://docs.google.com/spreadsheets/d/1Fal1vhFUJIUcLpM2kJect6IkgeUFvCY-nUr3RTupM_4/edit?gid=1732602731#gid=1732602731) for benchmarks and performance metrics.

Test Plan:
Verify that configs do not error, i.e.
```
CUDA_VISIBLE_DEVICES=0 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 buck2 run mode/{opt,inplace} pytorch/tritonbench:run -- --op fp8_gemm --only pt2_fp8_gemm --metrics tflops,accuracy --input-loader={input_path} --output="{output_csv}" --atol=1e-2 --rtol=0.5 2>&1 | tee {log_file}
```

Rollback Plan:

Reviewed By: NikhilAPatel, PaulZhang12

Differential Revision: D81651226

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162699
Approved by: https://github.com/PaulZhang12
2025-09-11 21:21:06 +00:00
3ad3bfe11d added example for torch.is_storage (#162614)
Fixes #162613

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162614
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-09-11 20:25:26 +00:00
1c6dfbe557 Revert "[inductor] FlexibleLayout for ExternKernelChoice for mms (#161351)"
This reverts commit f08487aa8692751c36e608e338204490b0955583.

Reverted https://github.com/pytorch/pytorch/pull/161351 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))
2025-09-11 20:24:15 +00:00
934f878883 Revert "[inductor] leverage template stacking in V.choices.get_mm_configs (#161350)"
This reverts commit 623e623c821f639559248e9acd6084311c8fd3d5.

Reverted https://github.com/pytorch/pytorch/pull/161350 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))
2025-09-11 20:24:15 +00:00
cef05b1202 Revert "[inductor][choices] rename get_mm_configs to get_template_configs (#162293)"
This reverts commit 30191fcf03ddd6a09381a490096c4bb721874316.

Reverted https://github.com/pytorch/pytorch/pull/162293 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))
2025-09-11 20:24:15 +00:00
b500c166ef [FlexAttention][Easy] turn off TMA when cannot use it (#162569)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162569
Approved by: https://github.com/drisspg
2025-09-11 19:51:19 +00:00
d65ffdef3d [ROCm] fix miopen batchnorm changing output format (#162112)
It was found that the miopen batchnorm integration was causing the output to always be in the default contiguous memory format even when the input was channels-last. This also unskips a number of related unit tests.
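
A small sketch of the behavior being fixed (assuming a ROCm build; the assertion failed before this change because the output silently became contiguous):

```python
import torch

bn = torch.nn.BatchNorm2d(8).cuda()
x = torch.randn(2, 8, 16, 16, device="cuda").to(memory_format=torch.channels_last)
y = bn(x)
assert y.is_contiguous(memory_format=torch.channels_last)  # input memory format preserved
```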

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162112
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Dmitry Nikolaev <dmitry.nikolaev@amd.com>
Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2025-09-11 19:37:48 +00:00
ac72f81c12 [dynamic shapes] unbacked-safe should_swap (#160473)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160473
Approved by: https://github.com/laithsakka
2025-09-11 18:51:25 +00:00
9cac1b9259 AMD CPU CI - Add freezing + fix label trigger (#162176)
Added the following changes:

1. Added freezing by default for AMD CPU based CI
2. Fixed issue with label based CI triggers

Addresses code review comment in https://github.com/pytorch/pytorch/pull/161155

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162176
Approved by: https://github.com/malfet, https://github.com/jeffdaily
2025-09-11 18:41:29 +00:00
9bc648235d [MPS] mps sparse mul op implementation (#162349)
Implements the MPS sparse mul operation and enables other operations such as the following (a usage sketch follows the list):
1. copy_
2. div
3. sum
4. floor
5. power
6. sub
7. floor_divide
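
A rough usage sketch on an MPS machine (illustrative only; tensor values are arbitrary):

```python
import torch

# Elementwise multiply of two sparse COO tensors on the "mps" device.
a = torch.tensor([[0.0, 1.0], [2.0, 0.0]], device="mps").to_sparse()
b = torch.tensor([[3.0, 0.0], [4.0, 0.0]], device="mps").to_sparse()
print((a * b).to_dense())
```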

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162349
Approved by: https://github.com/pearu, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-09-11 18:36:24 +00:00
799471d92b [triton] Update 3.5 pin (AMD compilation fix + warp spec) (#162733)
Fixes #162390

Also adds warp spec (thanks @manman-ren!)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162733
Approved by: https://github.com/atalman
2025-09-11 18:19:16 +00:00
43d9b5ecaa [ONNX] Set fallback=False by default (#162726)
This change addresses confusing error messages users encounter when using the ONNX exporter with default settings. Previously, `fallback=True` was the default, which would attempt to fall back to the TorchScript exporter when the dynamo path failed, leading to mixed error messages that obscured the actual issues.

## Problem

When `fallback=True` by default:
- Users get confusing error messages mixing dynamo and TorchScript export failures
- Error messages tell users to provide the `f` argument unnecessarily
- Dynamo error messages get flushed with TorchScript errors when both paths fail
- Users expecting the dynamo path get unexpected fallback behavior

## Solution

Changed the default from `fallback=True` to `fallback=False` in both:
- `torch.onnx.export()` function
- `torch.onnx._internal.exporter._compat.export_compat()` function

## Impact

**Before:**
```python
# Would fallback to TorchScript on dynamo failure, causing mixed error messages
torch.onnx.export(model, args)
```

**After:**
```python
# Clean dynamo-only errors by default
torch.onnx.export(model, args)

# Advanced users can still opt-in to fallback behavior
torch.onnx.export(model, args, fallback=True)
```

Fixes #162697

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162726
Approved by: https://github.com/titaiwangms, https://github.com/xadupre
2025-09-11 18:09:58 +00:00
463fbc8ca0 Support vmap + custom autograd function/improve DTensor constructor inefficiency (#162240)
This makes gemma3 exportable on transformers=4.55.4

In HF, there is a torch function mode called TransformGetItemToIndex which internally calls a custom autograd function. When this custom autograd function is called under vmap, it triggers CustomFunctionHigherOrderOP, which errored because there was no pre-dispatch proxy mode implementation.

Since there have been a number of requests lately to add various operators to pre-dispatch IR, I introduce a decorator in export that works similarly to `allow_in_graph`. Basically:
1) We intercept custom_autograd_function.apply at pre-dispatch mode when this decorator is applied
2) We apply the `flat_apply` HOP to hide the pytree spec for this autograd function. Note that this adds a restriction that the custom autograd function needs to take in fx-able types.
3) The subclass constructor decorator is implemented similarly, so we refactor it to use a similar implementation as this new decorator; eventually we should delete the subclass constructor decorator.
4) Move some code in the subclass constructor decorator to exit early in a non-export environment, which should shave off some inefficiency (around 1% according to @swolchok's benchmark)

Fixes: https://github.com/pytorch/pytorch/issues/161563#issuecomment-3246309758

Differential Revision: [D82141316](https://our.internmc.facebook.com/intern/diff/D82141316)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162240
Approved by: https://github.com/ydwu4
2025-09-11 17:42:41 +00:00
2f53395943 [ez][CI] Fix docs push in nightly workflow (#162657)
HUD metrics page says docs push hasn't happened in 21 days
<img width="293" height="142" alt="image" src="https://github.com/user-attachments/assets/f930aab8-0503-4bf2-b962-8c375dec6b78" />

I guess main branch docs just haven't been updated?  Did anyone notice?  Do we care?

Either way I think this should fix it

Likely started after https://github.com/pytorch/pytorch/pull/161182
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162657
Approved by: https://github.com/huydhn
2025-09-11 16:45:41 +00:00
fccddf02b6 repro 161902 (#162416)
Summary:
Sometimes `ShapeEnv.create_symbol` can return a `sympy.Integer`. This messes up our phantom symbol infra for derived dims.

Fixes #161902

Test Plan:
added test based on repro

Rollback Plan:

Differential Revision: D81960709

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162416
Approved by: https://github.com/tugsbayasgalan
2025-09-11 16:35:23 +00:00
8be8b94793 Update SECURITY.md with reporting guidelines (#162608)
Added clarification that all reports will be disclosed within 90 days

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162608
Approved by: https://github.com/seemethere, https://github.com/albanD
2025-09-11 16:30:29 +00:00
fe8cc619b8 [torch][c10d] fix split_group in mixed backend case (#162424)
Today we can initialize a mixed-backend process group (e.g. "cpu:gloo,cuda:nccl") but we can only pass one set of process group options.

However, when we call `split_group`, we retrieve that set of options from the parent PG and pass it to the ProcessGroup::groupSplit C++ API, which then attempts to propagate that set of options to all backends.

This leads to an assert in some user code, where ProcessGroupGloo::split expects Gloo options but receives NCCL options instead.

Arguably the APIs as currently designed are just broken; we should not ever expect a single set of backend options to apply across multiple backends. However, fixing this would require changing quite a few public APIs.

As a quick fix, since user-provided options really only exist for NCCL, just warn and fall back to default options for Gloo if non-Gloo options are detected.
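
A minimal sketch of the problematic setup, assuming a torchrun launch with CUDA available (exact `split_group` arguments may differ):

```python
import torch.distributed as dist
from torch.distributed.distributed_c10d import split_group

# Mixed-backend group initialized with NCCL-specific options only.
nccl_opts = dist.ProcessGroupNCCL.Options()
dist.init_process_group("cpu:gloo,cuda:nccl", pg_options=nccl_opts)

# Before this fix, splitting propagated the NCCL options to the Gloo backend and
# tripped an assert in ProcessGroupGloo::split; now Gloo falls back to defaults.
subgroup = split_group(split_ranks=[[0, 1]])
```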

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162424
Approved by: https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/H-Huang
2025-09-11 16:29:32 +00:00
2f5a24c2a2 Smoke tests don't run nvshmem on Windows (#162646)
Only available for linux x86 and aarch64 :
https://pypi.org/project/nvidia-nvshmem-cu13/#files

nvshmem is available only on linux:
``
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
``
https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py#L57
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162646
Approved by: https://github.com/kwen2501
2025-09-11 16:09:20 +00:00
24492cbab2 [BE] Cleanup stale comments/copy from gemm (#162001)
Followup after https://github.com/pytorch/pytorch/pull/154012

Since the introduction of `gemm_no_downcast_stub` it's no longer necessary to allocate temporary array and then manually implement the `beta` logic in the codebase
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162001
Approved by: https://github.com/drisspg
ghstack dependencies: #161999
2025-09-11 15:48:43 +00:00
3f6d88f04c paths to exclude shape guards (#162684)
Summary: Easier to land than https://www.internalfb.com/diff/D82030581

Test Plan:
everything blamed by https://www.internalfb.com/diff/D80713603 (except some old exir tests)

Rollback Plan:

Differential Revision: D82180349

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162684
Approved by: https://github.com/tugsbayasgalan
2025-09-11 15:34:06 +00:00
94db2ad51d Revert "Move prioritized text linker optimization code from setup.py to cmake (#160078)"
This reverts commit 26b3ae58908becbb03b28636f7384d2972a8c9a5.

Reverted https://github.com/pytorch/pytorch/pull/160078 on behalf of https://github.com/atalman due to Sorry reverting this broke linux aarch64 CUDA nightlies [pytorch/pytorch/actions/runs/17637486681/job/50146967503](https://github.com/pytorch/pytorch/actions/runs/17637486681/job/50146967503) ([comment](https://github.com/pytorch/pytorch/pull/160078#issuecomment-3281426631))
2025-09-11 15:29:29 +00:00
9f783e172d Revert "Build and Install Arm Compute Library in manylinux docker image (#159737)"
This reverts commit 582d278983b28a91ac0cedd035183f2495bb6887.

Reverted https://github.com/pytorch/pytorch/pull/159737 on behalf of https://github.com/atalman due to Sorry reverting this broke linux aarch64 CUDA nightlies [pytorch/pytorch/actions/runs/17637486681/job/50146967503](https://github.com/pytorch/pytorch/actions/runs/17637486681/job/50146967503) ([comment](https://github.com/pytorch/pytorch/pull/159737#issuecomment-3281398272))
2025-09-11 15:25:24 +00:00
a8432bcaad [dynamo][guards] Fail on an unknown framelocals to dict conversion (#162695)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162695
Approved by: https://github.com/williamwen42
ghstack dependencies: #162694
2025-09-11 15:01:00 +00:00
a3a40cb741 [dynamo][guards] Do not consturct framelocals to dict on GlobalsGuardAccessor (#162694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162694
Approved by: https://github.com/williamwen42
2025-09-11 15:01:00 +00:00
c924c675d0 Fix persistent buffer bug (#162190)
For non-persistent buffers, we should properly register them.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162190
Approved by: https://github.com/zhxchen17
2025-09-11 14:56:26 +00:00
c3f30eca9e Remove tests-to-include from rocm-mi300 workflow (#162721)
Accidentally introduced by https://github.com/pytorch/pytorch/pull/162288 (was meant to be a temporary change)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162721
Approved by: https://github.com/jeffdaily
2025-09-11 14:36:07 +00:00
1e710552c1 [ROCm][CI] benchmark must patch fbgemm_gpu with tbb dep (#162649)
fbgemm adds tbb as a dependency only for ROCm to avoid missing tbb symbols at import. But the way it was done was to add the linker flag to CMAKE_CXX_FLAGS in setup.py, and that wasn't working for reasons unknown to me. What did work was adding tbb as a dependency in the cmake file. [We have a PR against upstream fbgemm](https://github.com/pytorch/FBGEMM/pull/4859) for that. Meanwhile, a much smaller patch is applied here in this PR until the fbgemm rocm CI commit hash is moved forward to include the tbb patch from upstream.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162649
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-11 14:10:51 +00:00
7c39b2ecbe use torch.accelerator and device_module instead of cuda to make DataParallel more device agnostic. (#162573)
use torch.accelerator and `_get_device_module` instead of cuda to make DataParallel more device agnostic.

Fixes #162152

Recently, I've done some work to support my own privateuse1 backend in the DataParallel module, but I found that some CUDA-related APIs exist in parallel_apply.py, which forces me to monkey-patch DataParallel to support DP on my own backend.

So I made some small changes to replace cuda.xxx with accelerator.xxx and to acquire the device module via `_get_device_module`.

This is my first time contributing to PyTorch; please let me know if there is any problem with the change.
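
A sketch of the replacement pattern described above, using the public `torch.get_device_module` here for illustration (the PR itself uses the internal `_get_device_module` helper):

```python
import torch

acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cuda"
device_module = torch.get_device_module(device_type)

# Instead of torch.cuda.current_stream(), use the backend's own module:
stream = device_module.current_stream()
```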

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162573
Approved by: https://github.com/ezyang, https://github.com/guangyey

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
Co-authored-by: Edward Z. Yang <ezyang@mit.edu>
2025-09-11 10:04:27 +00:00
afdd4247a2 [torchao][pt2e] Make prepare and convert faster by caching (#162550)
Summary: D79674759 tried to fix the expensive prepare and convert steps, as `assert_and_get_unique_device` was called multiple times. This change fixes that issue by using the `functools.cache` decorator.
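
An illustration of the caching approach (not the actual torchao code; the helper name and semantics are simplified):

```python
import functools
import torch

@functools.cache  # keyed on the module object, so repeated passes reuse the result
def get_unique_device(module: torch.nn.Module) -> torch.device:
    devices = {p.device for p in module.parameters()} | {b.device for b in module.buffers()}
    assert len(devices) <= 1, f"expected at most one device, got {devices}"
    return next(iter(devices)) if devices else torch.device("cpu")
```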

Test Plan:
Verified on llm export to QNN.
LLM Quantization prepare time of ~20min reduced to ~3min.

Rollback Plan:

Differential Revision: D82073679

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162550
Approved by: https://github.com/andrewor14
2025-09-11 07:59:22 +00:00
22df9332da [serialization] Add pte file to archive (#162520)
Summary:
Add _package_executorch_files to the archive APIs. This allows us to package a PTE file into the archive.

I don't think there's a use-case to have more than one PTE file at the moment, but left it as `EXECUTORCH_FILES` just in case.

Test Plan:
Tested in D81992612

Rollback Plan:

Differential Revision: D81977483

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162520
Approved by: https://github.com/angelayi
2025-09-11 07:59:11 +00:00
6b9b7ce6fe fix torch.sparse.log_softmax on CPU (#161959)
Fix https://github.com/pytorch/pytorch/issues/152293.

**Example:**
```
import torch
from torch.sparse import log_softmax as sparse_log_softmax

def test_bug():
    a = torch.rand(4, 3)
    b = a - 10000000.0
    b_sparse = b.to_sparse()

    cpu_out_sparse = sparse_log_softmax(b_sparse, dim=1).to_dense()
    print('cpu_out_sparse =', cpu_out_sparse)

    b_sparse_double = b.double().to_sparse()
    cpu_out_sparse_double = sparse_log_softmax(b_sparse_double, dim=1).to_dense()
    print('cpu_out_sparse_double =', cpu_out_sparse_double)

if __name__ == '__main__':
    test_bug()
```

**Output:**

- before
```
cpu_out_sparse = tensor([[-2., -1., -2.],
        [-1., -1., -1.],
        [-1., -2., -2.],
        [-1., -1., -2.]])
cpu_out_sparse_double = tensor([[-1.5514, -0.5514, -1.5514],
        [-1.0986, -1.0986, -1.0986],
        [-0.5514, -1.5514, -1.5514],
        [-0.8620, -0.8620, -1.8620]], dtype=torch.float64)
```

- after
```
cpu_out_sparse = tensor([[-0.8620, -1.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986],
        [-1.8620, -0.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986]])
cpu_out_sparse_double = tensor([[-0.8620, -1.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986],
        [-1.8620, -0.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986]], dtype=torch.float64)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161959
Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/mingfeima
2025-09-11 07:52:05 +00:00
1274297e06 Remove __torch_dispatch__ check in THPVariable_make_dtensor (#162337)
We control DTensor, so we can just guarantee there isn't a programming error with __torch_dispatch__. (The guard is already less-than-perfect; see the note that the deleted comment refers to.)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162337
Approved by: https://github.com/Skylion007
ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220, #162218, #161596
2025-09-11 06:58:35 +00:00
f68f76d8c7 Remove logger.debug statements in DTensor dispatch (#161596)
These seem to have been costing us 5-10 usec per detach (out of ~95 usec total).  If they need to ship let's talk about requirements and how we can make this more efficient given that we would prefer if an entire DTensor op could finish in 10 usec.

Differential Revision: [D81530106](https://our.internmc.facebook.com/intern/diff/D81530106)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161596
Approved by: https://github.com/ezyang, https://github.com/Skylion007
ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220, #162218
2025-09-11 06:58:35 +00:00
fa1d409e83 [2/N]Port several test files under test/distributed to Intel GPU (#159473)
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This PR works on some test files under test/distributed. We enable Intel GPU with the following methods and try our best to keep the original code style:

- instantiate_device_type_tests()
- use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- use requires_accelerator_dist_backend to allow both nccl and xccl test
- enabled XPU for some test path
- Change the hardcoded world_size according to device_count.
- Unify some common code under torch/testing/_internal for multiple backend, for example:
  Added xpu for Backend.backend_capability and dist.Backend.register_backend()

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159473
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-09-11 06:44:26 +00:00
52d4660ae9 [AOTI] Fix Windows fail to zip opened file. (#162617)
Original issue:
<img width="1767" height="544" alt="Image" src="https://github.com/user-attachments/assets/9de90d50-217f-4049-8f19-77ff1660c8b0" />

reproducer:
```cmd
pytest test\inductor\test_aot_inductor.py -v -k test_weight_on_disk_legacy_cpu
```

Fixed list:
1. `WritableTempFile`'s `__exit__` function auto-unlinks the opened file; when the file is still open this raises an error, so ignore it on Windows.
2. When opening the zip file, if the file is already open, the operation fails. Switch to `_wfsopen` with the shared-access flag, which can open a file with shared access.

Local test passed:
<img width="1101" height="233" alt="image" src="https://github.com/user-attachments/assets/935cbf2e-52db-41f1-80fa-617569b92a96" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162617
Approved by: https://github.com/jansel
2025-09-11 06:22:21 +00:00
7345454e2e compile_kernel: Handle python floats as c double (#162626)
This was an open todo in the code and probably a footgun in waiting

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162626
Approved by: https://github.com/malfet
2025-09-11 06:03:25 +00:00
394 changed files with 10358 additions and 3693 deletions

View File

@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -13,6 +13,49 @@ def list_dir(path: str) -> list[str]:
return check_output(["ls", "-1", path]).decode().split("\n")
def build_ArmComputeLibrary() -> None:
"""
Using ArmComputeLibrary for aarch64 PyTorch
"""
print("Building Arm Compute Library")
acl_build_flags = [
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
acl_install_dir = "/acl"
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
if os.path.isdir(acl_install_dir):
shutil.rmtree(acl_install_dir)
if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
check_call(
[
"git",
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v25.02",
"--depth",
"1",
"--shallow-submodules",
]
)
check_call(
["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
@ -274,7 +317,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
@ -313,13 +356,19 @@ if __name__ == "__main__":
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
if enable_mkldnn:
build_ArmComputeLibrary()
print("build pytorch with mkldnn+acl backend")
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
build_vars += (
"USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
"ACL_ROOT_DIR=/acl "
"LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
"ACL_INCLUDE_DIR=/acl/build "
"ACL_LIBRARY=/acl/build "
)
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
else:
print("build pytorch without mkldnn backend")

View File

@ -299,6 +299,40 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
)
def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building OpenBLAS")
host.run_cmd(
f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}"
)
make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
host.run_cmd(
f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS"
)
def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building Arm Compute Library")
acl_build_flags = " ".join(
[
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
)
host.run_cmd(
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
)
host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
@ -666,6 +700,7 @@ def start_build(
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
build_OpenBLAS(host, git_clone_flags)
if host.using_docker():
print("Move libgfortant.a into a standard location")
@ -688,8 +723,6 @@ def start_build(
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
@ -710,18 +743,15 @@ def start_build(
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
build_ArmComputeLibrary(host, git_clone_flags)
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"
f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
@ -877,7 +907,7 @@ def terminate_instances(instance_type: str) -> None:
def parse_arguments():
from argparse import ArgumentParser
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser = ArgumentParser("Builid and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")

View File

@ -214,8 +214,7 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
# TODO (huydhn): Upgrade this to Python >= 3.10
ANACONDA_PYTHON_VERSION=3.9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
KATEX=yes

View File

@ -1 +1 @@
fccfc522864cf8bc172abe0cd58ae5581e2d44b9
5ae38bdb0dc066c5823e34dc9797afb9de42c866

27
.ci/docker/common/install_acl.sh Executable file → Normal file
View File

@ -1,27 +1,16 @@
#!/bin/bash
# Script used only in CD pipeline
set -euo pipefail
set -eux
ACL_VERSION=${ACL_VERSION:-"v25.02"}
ACL_INSTALL_DIR="/acl"
readonly version=v25.02
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary
# Clone ACL
git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules
[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git
cd ${src_repo}
git checkout $version
ACL_CHECKOUT_DIR="ComputeLibrary"
# Build with scons
pushd $ACL_CHECKOUT_DIR
scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \
os=linux arch=armv8a build=native multi_isa=1 \
fixed_format_kernels=1 openmp=1 cppthreads=0
popd
# Install ACL
sudo mkdir -p ${ACL_INSTALL_DIR}
for d in arm_compute include utils support src build
do
sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d}
done
rm -rf $ACL_CHECKOUT_DIR

12
.ci/docker/common/install_openblas.sh Executable file → Normal file
View File

@ -3,10 +3,8 @@
set -ex
OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"}
# Clone OpenBLAS
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
@ -19,7 +17,5 @@ CFLAGS=-O3
BUILD_BFLOAT16=1
"
make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR
sudo make install -C $OPENBLAS_CHECKOUT_DIR
rm -rf $OPENBLAS_CHECKOUT_DIR
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}

View File

@ -62,13 +62,6 @@ ARG OPENBLAS_VERSION
ADD ./common/install_openblas.sh install_openblas.sh
RUN bash ./install_openblas.sh && rm install_openblas.sh
# Install Arm Compute Library
FROM base as arm_compute
# use python3.9 to install scons
RUN python3.9 -m pip install scons==4.7.0
RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
COPY ./common/install_acl.sh install_acl.sh
RUN bash ./install_acl.sh && rm install_acl.sh
FROM base as final
# remove unnecessary python versions
@ -77,5 +70,4 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
COPY --from=arm_compute /acl /acl
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH

View File

@ -28,7 +28,6 @@ fi
MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
OPENBLAS_VERSION=${OPENBLAS_VERSION:-}
ACL_VERSION=${ACL_VERSION:-}
case ${image} in
manylinux2_28-builder:cpu)
@ -42,6 +41,7 @@ case ${image} in
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
MANY_LINUX_VERSION="2_28_aarch64"
OPENBLAS_VERSION="v0.3.30"
;;
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
TARGET=final
@ -121,8 +121,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \
--build-arg "ACL_VERSION=${ACL_VERSION:-}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
--target "${TARGET}" \
-t "${tmp_tag}" \
$@ \

View File

@ -66,6 +66,11 @@ class VllmBuildParameters:
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)
# the cleaning script to remove torch dependencies from pip
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
@ -160,6 +165,7 @@ class VllmBuildRunner(BaseRunner):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()
self.cp_torch_cleaning_script(inputs)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
@ -205,6 +211,11 @@ class VllmBuildRunner(BaseRunner):
copy(inputs.torch_whls_path, tmp_dir)
return tmp_dir
def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
script = get_path(inputs.cleaning_script, resolve=True)
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
copy(script, vllm_script)
def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
if not inputs.use_local_dockerfile:
logger.info("using vllm default dockerfile.torch_nightly for build")

View File

@ -11,7 +11,7 @@ from typing import Any
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
from cli.lib.common.path_helper import copy, remove_dir
from cli.lib.common.path_helper import copy, get_path, remove_dir
from cli.lib.common.pip_helper import (
pip_install_first_match,
pip_install_packages,
@ -43,6 +43,10 @@ class VllmTestParameters:
torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)
def __post_init__(self):
if not self.torch_whls_path.exists():
raise ValueError("missing torch_whls_path")
@ -92,11 +96,13 @@ class VllmTestRunner(BaseRunner):
self._set_envs(params)
clone_vllm(dst=self.work_directory)
self.cp_torch_cleaning_script(params)
with working_directory(self.work_directory):
remove_dir(Path("vllm"))
self._install_wheels(params)
self._install_dependencies()
# verify the torches are not overridden by test dependencies
check_versions()
def run(self):
@ -125,6 +131,11 @@ class VllmTestRunner(BaseRunner):
# double check the torches are not overridden by other packages
check_versions()
def cp_torch_cleaning_script(self, params: VllmTestParameters):
script = get_path(params.cleaning_script, resolve=True)
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
copy(script, vllm_script)
def _install_wheels(self, params: VllmTestParameters):
logger.info("Running vllm test with inputs: %s", params)
if not pkg_exists("torch"):

View File

@ -89,7 +89,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export USE_MKLDNN=1
export USE_MKLDNN_ACL=1
export ACL_ROOT_DIR=/acl
export ACL_ROOT_DIR=/ComputeLibrary
fi
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then

View File

@ -258,11 +258,19 @@ function install_torchrec_and_fbgemm() {
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
# until the fbgemm_commit includes the tbb patch
patch <<'EOF'
--- a/FbgemmGpu.cmake
+++ b/FbgemmGpu.cmake
@@ -184,5 +184,6 @@ gpu_cpp_library(
fbgemm_gpu_tbe_cache
fbgemm_gpu_tbe_optimizers
fbgemm_gpu_tbe_utils
+ tbb
DESTINATION
fbgemm_gpu)
EOF
python setup.py bdist_wheel --build-variant=rocm
popd
# Save the wheel before cleaning up
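One detail worth calling out in the hunk above: the FbgemmGpu.cmake fix is applied through a quoted heredoc, and the quotes around 'EOF' stop the shell from expanding $-signs or backticks inside the body, so the patch text reaches patch(1) verbatim. A self-contained sketch of the same pattern on a hypothetical file:

```bash
# Hypothetical file; the quoted 'EOF' keeps the heredoc body unexpanded.
cd "$(mktemp -d)"
patch -p1 <<'EOF'
--- /dev/null
+++ b/greeting.txt
@@ -0,0 +1,2 @@
+hello
+world
EOF
cat greeting.txt   # hello, then world
```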

View File

@ -35,11 +35,10 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then
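The comments in this hunk note that the distributed code is always built and that USE_DISTRIBUTED only toggles the backends (gloo in particular) on macOS. As a quick illustration of what that means for a resulting wheel, a probe in the same `python -c` style the CI scripts already use (illustration only, not part of the change):

```bash
# is_gloo_available() only exists when the distributed package is available, hence the short-circuit.
python -c "import torch.distributed as dist; a = dist.is_available(); print('distributed:', a); print('gloo:', a and dist.is_gloo_available())"
```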

View File

@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -177,6 +181,9 @@ checkout_install_torchbench() {
popd
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze

View File

@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:
def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
if not torch.cuda.is_available() or target_os == "windows":
print("Windows platform or CUDA is not available, skipping NVSHMEM test")
return
# Check if NVSHMEM is compiled in current build
@ -396,7 +396,9 @@ def smoke_test_nvshmem() -> None:
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
if torch.__version__ < "2.9":
from torch.torch_version import TorchVersion
if TorchVersion(torch.__version__) < (2, 9):
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build
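The change above swaps a raw string comparison of torch.__version__ for TorchVersion. That matters because lexicographic comparison misorders versions once the minor number reaches two digits; a quick way to see the difference (illustrative only):

```bash
python -c "print('2.10.0' < '2.9.0')"   # True: string compare, wrong answer for versions
python -c "from torch.torch_version import TorchVersion; print(TorchVersion('2.10.0') < (2, 9))"   # False: numeric compare
```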

View File

@ -778,6 +778,11 @@ test_single_dynamo_benchmark() {
}
test_inductor_micro_benchmark() {
# torchao requires cuda 8.0 or above for bfloat16 support
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;8.6"
fi
install_torchao
TEST_REPORTS_DIR=$(pwd)/test/test-reports
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
test_inductor_set_cpu_affinity
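The new guard exports TORCH_CUDA_ARCH_LIST="8.0;8.6" because, per the comment, torchao's bfloat16 path needs compute capability 8.0 or newer. For reference, the capability of the GPU actually present can be probed the same way other checks in this script are written (illustrative one-liners, not part of the change):

```bash
python -c "import torch; print(torch.cuda.get_device_capability() if torch.cuda.is_available() else 'no CUDA device')"
python -c "import torch; print('bf16:', torch.cuda.is_available() and torch.cuda.is_bf16_supported())"
```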
@ -1659,50 +1664,37 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
elif [[ "${TEST_CONFIG}" == *all* ]]; then
TEST_MODE="all"
fi
test_operator_benchmark cpu ${TEST_MODE}
fi
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
install_torchao
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark huggingface "$id"
elif [[ "${TEST_CONFIG}" == *timm* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
install_torchao
PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
install_torchao
PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
@ -1722,24 +1714,13 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
test_inductor_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then
if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
test_inductor_distributed
fi
fi
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then

View File

@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
# Create an isolated directory to store this builds pytorch checkout and conda
# installation
if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)"
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)"
fi
mkdir -p "$MAC_PACKAGE_WORK_DIR" || true
if [[ -n ${GITHUB_ACTIONS} ]]; then
@ -96,11 +96,11 @@ fi
whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist"
mkdir -p "$whl_tmp_dir"
mac_version='macosx_11_0_arm64'
mac_version='macosx-11_0-arm64'
libtorch_arch='arm64'
# Create a consistent wheel package name to rename the wheel to
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl"
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl"
###########################################################
@ -125,7 +125,6 @@ popd
export TH_BINARY_BUILD=1
export INSTALL_TEST=0 # dont install test binaries into site-packages
export MACOSX_DEPLOYMENT_TARGET=11.0
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
@ -133,25 +132,19 @@ RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
mac_version='macosx-11.0-arm64'
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.14)
echo "Using 3.14t deps"
mac_version='macosx-11.0-arm64'
NUMPY_PINNED_VERSION="==2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.13t)
echo "Using 3.13 deps"
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
RENAME_WHEEL=false
;;
3.13)
@ -176,20 +169,16 @@ case $desired_python in
;;
esac
# Install into a fresh env
tmp_env_name="wheel_py$python_nodot"
conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
source activate "$tmp_env_name"
PINNED_PACKAGES=(
"numpy${NUMPY_PINNED_VERSION}"
)
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
python -mvenv ~/${desired_python}-build
source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF
@ -199,7 +188,7 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"
python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}
_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"
echo "Finished setup.py bdist_wheel at $(date)"
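The wheel-naming change leans on bash pattern substitution: ${mac_version//[-.]/_} replaces every "-" and "." in the platform string with "_", so the dotted/dashed form passed to _PYTHON_HOST_PLATFORM still yields the underscore form that wheel filenames and --plat-name expect. (One hunk shows the character class as [-,] and the other as [-.]; the dot/dash form is what is illustrated here.)

```bash
mac_version='macosx-11.0-arm64'
echo "${mac_version//[-.]/_}"   # macosx_11_0_arm64
```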

View File

@ -73,7 +73,7 @@ exclude =
./docs/src,
./functorch/docs,
./functorch/examples,
./functorch/notebooks,
./functorch/docs/source/tutorials,
./scripts,
./test/generated_type_hints_smoketest.py,
./third_party,

View File

@ -21,6 +21,7 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@ -1 +1 @@
fa5142928ee157aa65137c4ecff2fe9b1a9e0648
87ff22e49ed0e92576c4935ccb8c143daac4a3cd

View File

@ -1 +1 @@
f32431e593d0e9db86c502d3872dd67ee40a005f
51c87b6ead6b7e098ada95d6a7609ee873b854cf

View File

@ -1 +1 @@
cc99baf14dacc2497d0c5ed84e076ef2c37f6a4d
973c9d01da863cac9c51e8a5c0d390fc84b84fbc

View File

@ -1 +1 @@
6c5478ff7c3d50dd1e3047d72ec5909bea474073
c77852e117bdf056c8e9a087e51d6f65cf6ba53d

View File

@ -82,16 +82,10 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo vim; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -220,11 +214,16 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
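The architecture selection above keys off TARGETPLATFORM, one of BuildKit's automatic platform build args. Its value (e.g. linux/amd64 or linux/arm64) comes from the platform the image is being built for, and it is only visible to a stage that declares it with ARG TARGETPLATFORM, which this Dockerfile presumably does outside the hunks shown. A hedged usage sketch with a made-up tag:

```bash
# --platform is what ultimately populates TARGETPLATFORM inside the build.
docker buildx build --platform linux/arm64 -t vllm-wheel-builder:dev .
```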
@ -285,7 +284,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo vim; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version
@ -298,12 +297,6 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \

View File

@ -0,0 +1,17 @@
import glob

requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]
for file in requires_files:
    print(f">>> cleaning {file}")
    with open(file) as f:
        lines = f.readlines()
    if "torch" in "".join(lines).lower():
        print("removed:")
        with open(file, "w") as f:
            for line in lines:
                if "torch" not in line.lower():
                    f.write(line)
    print(f"<<< done cleaning {file}")
    print()


View File

@ -15,7 +15,7 @@ optree==0.13.0
packaging==23.1
parameterized==0.8.1
pillow==10.3.0
protobuf==5.29.4
protobuf==5.29.5
psutil==5.9.8
pygments==2.15.0
pytest-cpp==2.3.0
@ -26,7 +26,7 @@ pytest-xdist==3.3.1
pytest==7.3.2
pyyaml==6.0.2
scipy==1.12.0
setuptools==72.1.0
setuptools==78.1.1
sympy==1.13.3
tlparse==0.4.0
tensorboard==2.13.0

View File

@ -39,7 +39,9 @@ def main() -> None:
pull_request_label_names = [label.name for label in pull_request_labels]
issue_label_names = [label.name for label in issue_labels]
labels_to_add = [
label for label in issue_label_names if label not in pull_request_label_names
label
for label in issue_label_names
if label not in pull_request_label_names and label != "actionable"
]
if not labels_to_add:
print("The pull request already has the same labels.")

View File

@ -84,6 +84,9 @@ repackage_wheel() {
rm -rf $package
}
# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package
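The added `pip install wheel==0.45.1` is there because re-packaging relies on the wheel package's CLI. The body of repackage_wheel is not shown in this hunk, but the unpack/edit/repack cycle that CLI supports looks roughly like the following (a sketch under that assumption, with a made-up wheel name):

```bash
# Hypothetical wheel name; 'wheel unpack' and 'wheel pack' are subcommands of the wheel package.
python -m wheel unpack example_pkg-1.0-py3-none-any.whl -d unpacked/
# ... adjust metadata or dependencies inside unpacked/example_pkg-1.0/ ...
python -m wheel pack unpacked/example_pkg-1.0 -d repacked/
```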

View File

@ -22,6 +22,16 @@ name: !{{ build_environment }}
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
{%- endmacro %}
{%- macro setup_python(py_ver) -%}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}
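The templated python-version line packs a small conditional: non-3.14 versions get a ".4" patch suffix after stripping a trailing "t" (the free-threaded marker), while anything containing "3.14" maps to the release candidate. Evaluating the same expression directly shows the mapping (a quick illustration in the `python -c` style used elsewhere in these scripts):

```bash
python -c "f = lambda v: (v.strip('t') + '.4') if '3.14' not in v else '3.14.0-rc.2'; print(f('3.10'), f('3.13t'), f('3.14'), f('3.14t'))"
# 3.10.4 3.13.4 3.14.0-rc.2 3.14.0-rc.2
```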
on:
# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321
push:
@ -61,23 +71,13 @@ jobs:
{%- endif %}
steps:
!{{ set_runner_specific_vars() }}
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
!{{ setup_python(config.get("python_version", "3.10")) }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -94,8 +94,6 @@ jobs:
{%- if config["package_type"] == "wheel" %}
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -106,33 +104,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086

View File

@ -12,6 +12,9 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -24,21 +27,33 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- device: cu128
- platform: manylinux_2_28_x86_64
device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
- device: cu129
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
name: "Build ${{ matrix.device }} vLLM wheel"
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: 'manylinux_2_28_x86_64'
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@ -136,7 +151,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -146,15 +161,17 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel"
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
@ -190,15 +207,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
- name: Set UPLOAD_CHANNEL
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |

View File

@ -60,13 +60,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -81,13 +81,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"

View File

@ -56,13 +56,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -77,13 +77,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -99,8 +95,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -111,33 +105,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -196,13 +166,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -217,13 +187,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -239,8 +205,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -251,33 +215,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -336,13 +276,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -357,13 +297,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -379,8 +315,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -391,33 +325,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -476,13 +386,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -497,13 +407,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -519,8 +425,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -531,33 +435,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -616,13 +496,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -637,13 +517,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -659,8 +535,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -671,33 +545,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -756,13 +606,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -777,13 +627,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -799,8 +645,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -811,33 +655,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -896,13 +716,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -917,13 +737,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -939,8 +755,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -951,33 +765,9 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086

View File

@ -35,8 +35,6 @@ jobs:
needs:
- get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
@ -45,7 +43,6 @@ jobs:
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test:

View File

@ -37,7 +37,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -56,7 +56,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -137,6 +137,7 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests, next step is to enable it
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
@ -153,6 +154,7 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
# disable monitor in perf tests, next step is to enable it
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
@ -171,6 +173,7 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4

View File

@ -75,7 +75,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -101,7 +101,7 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -118,7 +118,7 @@ jobs:
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}

View File

@ -80,7 +80,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -107,7 +107,7 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -124,7 +124,7 @@ jobs:
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}

View File

@ -36,12 +36,10 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -64,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -130,8 +128,6 @@ jobs:
needs:
- get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
@ -158,7 +154,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -204,7 +200,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -33,8 +33,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -47,7 +45,6 @@ jobs:
{ config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test:
@ -113,7 +110,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -130,7 +127,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -49,8 +49,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -81,7 +79,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -103,7 +101,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -54,7 +54,7 @@ jobs:
- get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.docs-build.outputs.docker-image }}
push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }}
run-doxygen: true

View File

@ -14,6 +14,10 @@ on:
schedule:
# Run at 07:00 UTC every Sunday
- cron: 0 7 * * 0
pull_request:
paths:
- benchmarks/operator_benchmark/**
- .github/workflows/operator_benchmark.yml
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -29,7 +33,7 @@ jobs:
name: opbenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -42,7 +46,7 @@ jobs:
name: opbenchmark-on-demand-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -55,7 +59,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -70,5 +70,4 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi300
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "inductor/test_ck_backend"
secrets: inherit

View File

@ -239,16 +239,13 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
verify-cachebench-cpu-test:
@ -258,7 +255,7 @@ jobs:
- verify-cachebench-cpu-build
- target-determination
with:
build-environment: linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit

.gitignore vendored
View File

@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak
@ -392,3 +389,5 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py

View File

@ -13,7 +13,7 @@ exclude_patterns = [
'**/fb/**',
'functorch/docs/**',
'functorch/examples/**',
'functorch/notebooks/**',
'functorch/docs/source/tutorials/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'scripts/**',
@ -1568,7 +1568,6 @@ include_patterns = [
exclude_patterns = [
'caffe2/**',
'functorch/docs/**',
'functorch/notebooks/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'test/dynamo/cpython/**',

View File

@ -22,7 +22,6 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -811,7 +810,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ if_cuda(libtorch_python_distributed_sources)
+ libtorch_python_distributed_sources
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",

View File

@ -181,8 +181,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,11 +263,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -379,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -438,11 +432,10 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()
@ -664,11 +657,6 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -1433,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
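Per the updated comments, USE_DISTRIBUTED now only gates the default distributed backends; the distributed sources themselves are still compiled. A hedged sketch, in Python, of checking what a given build actually exposes (standard torch.distributed queries):

import torch

# True unless the build disabled the distributed backends
print(torch.distributed.is_available())
if torch.distributed.is_available():
    # Backend-specific availability depends on USE_NCCL / USE_GLOO / USE_XCCL at build time
    print(torch.distributed.is_nccl_available())
    print(torch.distributed.is_gloo_available())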

View File

@ -16,6 +16,8 @@ However, if you believe you have found a security vulnerability in PyTorch, we e
Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new
All reports submitted through the security advisories mechanism will **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml), as it is still likely a valid issue within the framework.
Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
https://www.facebook.com/whitehat

View File

@ -457,24 +457,9 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kBFloat16,
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
void gemm(
@ -493,24 +478,9 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kHalf,
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
void gemm(
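The simplified fallback forwards beta straight to gemm_no_downcast_stub instead of doing the beta-handling loop by hand. For orientation, a minimal Python sketch of the BLAS-style contract this path computes (reference only; gemm_reference is an illustrative name, not a PyTorch API):

import torch

def gemm_reference(alpha, a, b, beta, c):
    # C <- alpha * (A @ B) + beta * C, accumulated in float32 before
    # casting back to the low-precision dtype of C (bf16/fp16 fallback).
    acc = alpha * (a.float() @ b.float()) + beta * c.float()
    return acc.to(c.dtype)

a = torch.randn(4, 3, dtype=torch.bfloat16)
b = torch.randn(3, 5, dtype=torch.bfloat16)
c = torch.randn(4, 5, dtype=torch.bfloat16)
out = gemm_reference(1.0, a, b, 0.5, c)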

View File

@ -1360,7 +1360,8 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
#endif
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
// Used by default on x86 platforms and on AArch64+ACL
static inline int64_t get_mkldnn_matmul_min_dim() {
static auto value = [&] {
const int64_t default_min_dim = [&] {
@ -1395,8 +1396,6 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
}
#endif
static void addmm_impl_cpu_(
Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
@ -1772,8 +1771,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
};
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
// Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
try {
@ -1785,7 +1784,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
}
}
#endif
if (contraction_size * res_rows * res_cols < 400) {
if (is_bmm_out) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] {

View File

@ -624,7 +624,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
if (backend == BatchNormBackend::Miopen) {
return std::tuple_cat(
at::miopen_batch_norm(
input.contiguous(), weight.contiguous(), bias.contiguous(),
input.contiguous(input.suggest_memory_format()),
weight.contiguous(),
bias.contiguous(),
running_mean.defined() ? running_mean.contiguous() : running_mean,
running_var.defined() ? running_var.contiguous() : running_var,
training, momentum, eps),

View File

@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (const auto d : c10::irange(index.dim())) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
if (d == dim) {
continue;
}
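The loop now walks the index dimensions from last to first when squeezing a broadcast index. A hedged sketch of the kind of call that can reach _scatter_via_index_put on a multi-dimensional tensor, assuming the deterministic scatter_add fallback routes through it as the function name suggests (requires a CUDA device):

import torch

torch.use_deterministic_algorithms(True)
out = torch.zeros(2, 3, 4, device="cuda")
src = torch.randn(2, 3, 4, device="cuda")
index = torch.randint(0, 3, (2, 3, 4), device="cuda")
out.scatter_add_(1, index, src)   # deterministic path implemented via index_put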

View File

@ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) {
[zero, one_sixth, three, six] GPU_LAMBDA(
scalar_t self_val) -> scalar_t {
opmath_t x = static_cast<opmath_t>(self_val);
return std::min(std::max(x + three, zero), six) * one_sixth;
return std::min<opmath_t>(std::max<opmath_t>(x + three, zero), six) * one_sixth;
});
});
}
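The lambda is the elementwise hardsigmoid definition, min(max(x + 3, 0), 6) / 6; the explicit <opmath_t> template arguments pin both std::min/std::max operands to the opmath type. A quick Python cross-check of the same formula:

import torch
import torch.nn.functional as F

x = torch.linspace(-5, 5, steps=11)
manual = torch.clamp(x + 3, min=0, max=6) / 6   # min(max(x + 3, 0), 6) * 1/6
assert torch.allclose(F.hardsigmoid(x), manual)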

View File

@ -317,6 +317,17 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();
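A hedged usage sketch of the behaviour the new early-return describes, assuming a CUDA build (nonzero_static pre-sizes its output to `size` rows):

import torch

x = torch.empty(0, device="cuda")              # no elements, so nothing is nonzero
out = torch.nonzero_static(x, size=4, fill_value=-1)
print(out.shape)   # torch.Size([4, 1]) for a 1-D input
print(out)         # every entry is fill_value (-1); no kernel grid is computed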

View File

@ -7,6 +7,7 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/miopen_batch_norm_native.h>
#include <ATen/ops/miopen_batch_norm_backward_native.h>
#endif
@ -102,7 +103,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
mode = miopenBNSpatial;
}
auto output_t = at::empty(input->sizes(), input->options());
auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
TensorArg output{ output_t, "output", 0 };
auto handle = getMiopenHandle();
@ -170,20 +171,15 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
const std::optional<Tensor>& save_var_t_opt,
double epsilon) {
// See [Note: hacky wrapper removal for optional tensor]
const Tensor& running_mean =
running_mean_opt.value_or(Tensor());
const Tensor& running_var =
running_var_opt.value_or(Tensor());
const Tensor& save_mean_t =
save_mean_t_opt.value_or(Tensor());
const Tensor& save_var_t =
save_var_t_opt.value_or(Tensor());
const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());
TensorArg input{ input_t, "input", 1 },
grad_output{ grad_output_t, "grad_output", 2 },
weight{ weight_t, "weight", 3 },
save_mean{ save_mean_t, "save_mean", 4 },
save_var{ save_var_t, "save_var", 5 };
auto grad_output_contig =
grad_output_t.contiguous(input_t.suggest_memory_format());
TensorArg input{input_t, "input", 1},
grad_output{grad_output_contig, "grad_output", 2},
weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
save_var{save_var_t, "save_var", 5};
CheckedFrom c = "miopen_batch_norm_backward";
checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
@ -195,7 +191,11 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
}
checkAllSameType(c, {input, grad_output});
checkAllSameType(c, {weight, save_mean, save_var});
checkAllContiguous(c, {input, grad_output, save_mean, save_var});
// TODO: is weight required to be contiguous?
checkAllContiguous(c, {save_mean, save_var});
// TODO: TensorArg check should start handle memory format
TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
checkDimRange(c, input, 2, 6 /* exclusive */);
checkSameSize(c, input, grad_output);
auto num_features = input->size(1);
@ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
mode = miopenBNSpatial;
}
auto grad_input_t = at::empty(input->sizes(), input->options());
auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
auto grad_bias_t = at::empty(weight->sizes(), weight->options());
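The forward and backward paths now honour the input's suggested memory format instead of forcing default-contiguous tensors. A hedged sketch of the kind of call affected, assuming a ROCm build where MIOpen backs batch norm:

import torch

x = torch.randn(8, 32, 28, 28, device="cuda").to(memory_format=torch.channels_last)
bn = torch.nn.BatchNorm2d(32).to("cuda")
y = bn(x)              # forward keeps the channels_last layout
y.sum().backward()     # backward makes grad_output contiguous in the same format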

View File

@ -1,48 +0,0 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
MPSGraphFFTScalingModeNone = 0L,
MPSGraphFFTScalingModeSize = 1L,
MPSGraphFFTScalingModeUnitary = 2L,
};
@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end
@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;
@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
@end
// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))
// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif

View File

@ -1,196 +0,0 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;
@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;
@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;
@property(readwrite, nonatomic) NSUInteger groups;
@end
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;
#endif
@interface MPSGraph (VenturaOps)
#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
MPSGraphResizeNearestRoundingModeCeil = 2L,
MPSGraphResizeNearestRoundingModeFloor = 3L,
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};
// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif
- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
weightsTensor:(MPSGraphTensor* _Nonnull)weights
descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
weightsTensor:(MPSGraphTensor* _Nonnull)weights
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
sourceTensor:(MPSGraphTensor* _Nonnull)source
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
samplingMode:(MPSGraphResizeMode)samplingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
@end

View File

@ -9,8 +9,6 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>

View File

@ -8,8 +8,6 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,23 +1,12 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
#include <fmt/format.h>
namespace at::native {
@ -50,11 +39,9 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;
// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;
descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@ -186,18 +173,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
bias_shape = bias_opt.value().sizes();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
std::string bias_shape_key;
if (bias_defined) {
bias_shape_key = std::to_string(bias_shape[0]);
@ -205,20 +180,16 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
bias_shape_key = "nobias";
}
std::string key;
if (is3DConv) {
key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
} else {
key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}
std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
@ -400,33 +371,15 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
} else {
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
}
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@ -551,19 +504,13 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
} else {
key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
}
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
bool isDepthwiseConv =

View File

@ -2,7 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

View File

@ -5,8 +5,6 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,4 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -12,20 +10,6 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
// Redispatch the constructor to the actual implementation
id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {

View File

@ -2,7 +2,6 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

View File

@ -17,7 +17,6 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -6,9 +6,7 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -4,7 +4,6 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>
@ -617,6 +616,7 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) {
// we allocate 1 here due to MacOS13 bug for gather MPSGraph op, look below for the error
Tensor output_t = at::empty({1}, input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
if (output_t.numel() == 0 || num_in_elements == 0) {
output_t.fill_(std::numeric_limits<float>::quiet_NaN());
return output_t;
}
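A hedged illustration of the empty-input branch being filled in, assuming this is the path taken for a dim-less median/nanmedian on an MPS tensor:

import torch

x = torch.tensor([], device="mps")
print(x.median())   # with the added fill_, the result is NaN rather than uninitialised memory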

View File

@ -4,7 +4,6 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -5,7 +5,6 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -2,8 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,7 +1,6 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,7 +1,6 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -4,8 +4,6 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -1414,7 +1414,7 @@
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
structured_delegate: cat.out
dispatch:
SparseCPU, SparseCUDA: cat_sparse
SparseCPU, SparseCUDA, SparseMPS: cat_sparse
QuantizedCPU: cat_quantized_cpu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
tags: core
@ -1798,7 +1798,7 @@
device_guard: False
dispatch:
MkldnnCPU: copy_mkldnn_
SparseCPU, SparseCUDA: copy_sparse_wrapper_
SparseCPU, SparseCUDA, SparseMPS: copy_sparse_wrapper_
CompositeExplicitAutograd: copy_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
@ -2160,7 +2160,7 @@
variants: function, method
structured_delegate: div.out
dispatch:
SparseCPU, SparseCUDA: div_sparse
SparseCPU, SparseCUDA, SparseMPS: div_sparse
ZeroTensor: div_zerotensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
tags: [core, pointwise]
@ -2170,7 +2170,7 @@
variants: method
structured_delegate: div.out
dispatch:
SparseCPU, SparseCUDA: div_sparse_
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
tags: pointwise
- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -2179,7 +2179,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: div_out
SparseCPU, SparseCUDA: div_out_sparse_zerodim
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
tags: pointwise
- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@ -2187,7 +2187,7 @@
variants: function, method
structured_delegate: div.out_mode
dispatch:
SparseCPU, SparseCUDA: div_sparse
SparseCPU, SparseCUDA, SparseMPS: div_sparse
tags: [core, pointwise]
- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
@ -2195,7 +2195,7 @@
variants: method
structured_delegate: div.out_mode
dispatch:
SparseCPU, SparseCUDA: div_sparse_
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
tags: pointwise
- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
@ -2204,7 +2204,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: div_out_mode
SparseCPU, SparseCUDA: div_out_sparse_zerodim
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
tags: pointwise
# For C++ only, until we have conversion from C++ numbers to Tensor
@ -2517,7 +2517,7 @@
dispatch:
CompositeExplicitAutograd: empty_like
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
@ -2768,20 +2768,20 @@
variants: function, method
dispatch:
CPU, CUDA, MPS, MTIA: floor_divide
SparseCPU, SparseCUDA: floor_divide_sparse
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
CPU, CUDA, MPS: floor_divide_
SparseCPU, SparseCUDA: floor_divide_sparse_
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse_
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: floor_divide_out
SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
device_check: NoCheck # TensorIterator
@ -4273,7 +4273,7 @@
structured_delegate: mul.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: mul_sparse
SparseCPU, SparseCUDA, SparseMPS: mul_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
MkldnnCPU: mkldnn_mul
ZeroTensor: mul_zerotensor
@ -4285,7 +4285,7 @@
structured_delegate: mul.out
variants: method
dispatch:
SparseCPU, SparseCUDA: mul_sparse_
SparseCPU, SparseCUDA, SparseMPS: mul_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
MkldnnCPU: mkldnn_mul_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
@ -4299,6 +4299,7 @@
CPU, CUDA, MPS, MTIA: mul_out
SparseCPU: mul_out_sparse_cpu
SparseCUDA: mul_out_sparse_cuda
SparseMPS: mul_out_sparse_mps
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
MkldnnCPU: mkldnn_mul_out
tags: pointwise
@ -5848,7 +5849,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: sum
SparseCPU, SparseCUDA, SparseMeta: sum_coo
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
autogen: sum.out
@ -5859,7 +5860,7 @@
variants: function, method
dispatch:
NestedTensorCPU: NestedTensor_sum_dim_CPU
SparseCPU, SparseCUDA: sum_sparse_coo
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
tags: core
@ -6491,7 +6492,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA: unsqueeze_sparse
SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core
@ -6975,7 +6976,7 @@
CPU, CUDA: sub_out
MPS: sub_out_mps
MTIA: sub_out_mtia
SparseCPU, SparseCUDA: sub_out_sparse
SparseCPU, SparseCUDA, SparseMPS: sub_out_sparse
tags: pointwise
- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@ -6983,7 +6984,7 @@
variants: function, method
structured_delegate: sub.out
dispatch:
SparseCPU, SparseCUDA: sub_sparse
SparseCPU, SparseCUDA, SparseMPS: sub_sparse
ZeroTensor: sub_zerotensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
tags: [core, pointwise]
@ -6993,7 +6994,7 @@
variants: method
structured_delegate: sub.out
dispatch:
SparseCPU, SparseCUDA: sub_sparse_
SparseCPU, SparseCUDA, SparseMPS: sub_sparse_
tags: pointwise
# For C++ only, until we have conversion from C++ numbers to Tensor
@ -10258,7 +10259,7 @@
structured_delegate: any.all_out
variants: method, function
dispatch:
SparseCPU, SparseCUDA: any_sparse
SparseCPU, SparseCUDA, SparseMPS: any_sparse
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -10342,7 +10343,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: pow_Tensor_Scalar_out
SparseCPU, SparseCUDA: pow_out_sparse_scalar
SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
MPS: pow_tensor_scalar_out_mps
tags: pointwise
@ -10351,7 +10352,7 @@
structured_delegate: pow.Tensor_Scalar_out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: pow_sparse_scalar
SparseCPU, SparseCUDA, SparseMPS: pow_sparse_scalar
tags: [core, pointwise]
- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
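These dispatch-key additions route several sparse COO ops to the existing sparse kernels when their tensors live on MPS. A hedged usage sketch (assuming an MPS-enabled build with sparse support):

import torch

s = torch.eye(4, device="mps").to_sparse()   # sparse COO tensor on MPS

print((s / 2).to_dense())     # div_sparse, now also registered for SparseMPS
print(s.sum())                # sum_coo
print(s.unsqueeze(0).shape)   # unsqueeze_sparse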

View File

@ -2,6 +2,7 @@
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <ATen/Dispatch.h>
#include <ATen/AccumulateType.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/sparse/ParamUtils.h>
#include <ATen/native/SparseTensorUtils.h>
@ -295,6 +296,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
to exp functions as well as reuse of softmax implementation for
log_softmax.
*/
using accscalar_t = at::acc_type<scalar_t, false>;
auto sparse_dim = input.sparse_dim();
auto indices = input._indices().contiguous();
auto values = input._values().contiguous();
@ -340,14 +342,14 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
continue;
/* Prepare scratch space */
std::vector<scalar_t> mx_row(nvalues, -std::numeric_limits<scalar_t>::infinity());
std::vector<scalar_t> exp_sums_row(nvalues, 0);
std::vector<accscalar_t> mx_row(nvalues, -std::numeric_limits<accscalar_t>::infinity());
std::vector<accscalar_t> exp_sums_row(nvalues, 0);
/* Compute mx */
for (int64_t i : pool_indices) {
auto values_row = values_accessor[i];
for (const auto j : c10::irange(nvalues)) {
mx_row[j] = std::max(mx_row[j], values_row[j]);
mx_row[j] = std::max(mx_row[j], accscalar_t(values_row[j]));
}
}
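The scratch buffers for the per-row maxima and exp-sums now use the accumulate type, so reduced-precision values are reduced in float. A hedged sketch of a call exercising this path, assuming half inputs are dispatched to cpu_sparse_coo_softmax:

import torch

i = torch.tensor([[0, 0, 1], [0, 1, 0]])
v = torch.tensor([10.0, 20.0, 30.0], dtype=torch.half)
s = torch.sparse_coo_tensor(i, v, (2, 2))
out = torch.sparse.softmax(s, dim=1)   # max/exp-sum accumulation happens in float32
print(out.to_dense())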

View File

@ -10,6 +10,7 @@
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/mul_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#include <ATen/ops/result_type.h>
@ -20,10 +21,265 @@
namespace at::native {
using namespace at::sparse;
using namespace mps;
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Mul_metallib.h>
#endif
Tensor& add_out_dense_sparse_mps(
static SparseTensor& mul_out_dense_sparse_mps(
const Tensor& dense,
const Tensor& sparse,
SparseTensor& out) {
TORCH_CHECK(sparse.is_sparse(), "mul: expected 'sparse' to be sparse COO");
TORCH_CHECK(sparse.is_mps(), "mul: expected 'sparse' to be MPS, got ", sparse.device());
TORCH_CHECK(out.is_mps(), "mul: expected 'out' to be MPS, got ", out.device());
const bool scalar_like = (dense.dim() == 0) || (dense.numel() == 1);
TORCH_CHECK(dense.is_mps() || scalar_like,
"mul: expected 'dense' to be MPS or scalar-like, got ", dense.device());
const int64_t nnz = sparse._nnz();
out.resize_as_(sparse);
auto commonDtype = at::result_type(dense, sparse);
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
auto indices = sparse._indices().contiguous();
auto values = sparse._values().to(commonDtype).contiguous();
if (nnz == 0) {
auto empty_vals = values.narrow(0, 0, 0);
alias_into_sparse(out,
indices.narrow(1, 0, 0),
(out.scalar_type() == commonDtype) ? empty_vals
: empty_vals.to(out.scalar_type()));
out._coalesced_(sparse.is_coalesced());
return out;
}
if (scalar_like) {
auto scalar = dense;
if (dense.numel() == 1 && dense.dim() > 0) {
scalar = dense.view({});
}
scalar = scalar.to(values.options());
auto out_vals = values.mul(scalar);
if (out.scalar_type() != commonDtype) {
out_vals = out_vals.to(out.scalar_type());
}
alias_into_sparse(out, indices, out_vals);
out._coalesced_(sparse.is_coalesced());
return out;
}
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
"mul(dense, sparse): sizes must match exactly (no broadcasting): ",
dense.sizes(), " vs ", sparse.sizes());
const int64_t ndim_i = sparse.sparse_dim();
const int64_t ndim = dense.dim();
TORCH_CHECK(
ndim_i <= ndim,
"mul(dense, sparse): sparse_dim=", ndim_i, " exceeds dense.dim()=", ndim);
// Prepare shapes
int64_t view_rows = 1, view_cols = 1;
for (int64_t i = 0; i < ndim_i; ++i) view_rows *= sparse.size(i);
for (int64_t i = ndim_i; i < ndim; ++i) view_cols *= sparse.size(i);
auto dense_mps = dense.to(commonDtype).contiguous().reshape({view_rows, view_cols});
auto out_vals = at::empty_like(values, values.options());
const uint32_t u_view_cols = static_cast<uint32_t>(view_cols);
const uint32_t u_nnz = static_cast<uint32_t>(nnz);
const uint32_t u_ndim_i = static_cast<uint32_t>(ndim_i);
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("dense_sparse_mul_kernel_" + mps::scalarToMetalTypeString(values));
auto computeEncoder = stream->commandEncoder();
[computeEncoder setComputePipelineState:pso];
const uint32_t gridWidth = u_view_cols;
const uint32_t gridDepth = u_nnz;
MTLSize gridSize = MTLSizeMake(gridWidth, 1, gridDepth);
const uint32_t maxThreadsPerGroup = pso.maxTotalThreadsPerThreadgroup;
const uint32_t tew = pso.threadExecutionWidth;
uint32_t tgWidth = std::min(gridWidth, tew);
MTLSize threadgroupSize = MTLSizeMake(tgWidth, 1, 1);
mtl_setArgs(
computeEncoder,
dense_mps,
values,
out_vals,
indices,
sparse.sizes(),
std::array<uint32_t, 3>{u_nnz, u_ndim_i, u_view_cols}
);
[computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
}
});
Tensor final_vals = out_vals;
if (out.scalar_type() != commonDtype) {
final_vals = final_vals.to(out.scalar_type());
}
alias_into_sparse(out, indices, final_vals);
out._coalesced_(sparse.is_coalesced());
return out;
}
SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());
// Dense x sparse fallback (keep dense first)
if (!t_.is_sparse() || !src_.is_sparse()) {
const Tensor& dense = t_.is_sparse() ? src_ : t_;
const Tensor& sparse = t_.is_sparse() ? t_ : src_;
return mul_out_dense_sparse_mps(dense, sparse, r_);
}
TORCH_CHECK(t_.is_mps(), "mul: expected 'self' to be MPS, but got ", t_.device());
TORCH_CHECK(src_.is_mps(), "mul: expected 'other' to be MPS, but got ", src_.device());
TORCH_CHECK(t_.sparse_dim() == src_.sparse_dim(),
"mul(sparse, sparse): must have same sparse_dim, got ",
t_.sparse_dim(), " vs ", src_.sparse_dim());
TORCH_CHECK(t_.sizes().equals(src_.sizes()),
"mul(sparse, sparse): sizes must match exactly (no broadcasting).");
// Coalesce and early-exit on structurally empty operands
auto lhs = t_.coalesce();
auto rhs = src_.coalesce();
const int64_t lhs_nnz = lhs._nnz();
const int64_t rhs_nnz = rhs._nnz();
if (!lhs_nnz || !rhs_nnz) {
r_.resize_as_(lhs);
return r_.zero_();
}
// dtype checks and promotion
auto commonDtype = at::result_type(lhs, rhs);
TORCH_CHECK(canCast(commonDtype, r_.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", r_.scalar_type());
const int64_t ndim_i = lhs.sparse_dim();
// ndim_i == 0, at most one structural entry
if (ndim_i == 0) {
r_.resize_as_(lhs);
const bool has = (lhs_nnz && rhs_nnz);
auto out_indices = lhs._indices().narrow(1, 0, has ? 1 : 0);
Tensor lhs_vals = lhs._values().to(commonDtype);
Tensor rhs_vals = rhs._values().to(commonDtype);
lhs_vals = lhs_vals.narrow(0, 0, has ? 1 : 0);
rhs_vals = rhs_vals.narrow(0, 0, has ? 1 : 0);
Tensor out_values = lhs_vals.mul(rhs_vals);
if (r_.scalar_type() != commonDtype) {
out_values = out_values.to(r_.scalar_type());
}
alias_into_sparse(r_, out_indices, out_values);
r_._coalesced_(true);
return r_;
}
// General path, intersect keys, then gather + multiply on GPU
const auto device = r_.device();
auto stream = getCurrentMPSStream();
auto lhs_indices = lhs._indices();
auto rhs_indices = rhs._indices();
auto lhs_values = lhs._values().to(commonDtype);
auto rhs_values = rhs._values().to(commonDtype);
// Flatten sparse indices to keys
auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes());
auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes());
// Intersect sorted keys (search the shorter in the longer)
const bool A_is_lhs = (lhs_nnz <= rhs_nnz);
const int64_t lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
const int64_t lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;
auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
auto counter = at::zeros({1}, at::device(device).dtype(kInt));
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
static_cast<uint32_t>(lenB), A_is_lhs);
mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
}
});
const uint32_t M = counter.item<int32_t>(); // number of structural matches
r_.resize_as_(lhs);
auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong));
auto lhs_match = outA_idx.narrow(0, 0, M);
auto rhs_match = outB_idx.narrow(0, 0, M);
auto out_val_sizes = lhs_values.sizes().vec();
out_val_sizes[0] = static_cast<int64_t>(M);
auto out_values = at::empty(out_val_sizes, lhs_values.options());
const uint32_t cols = static_cast<uint32_t>(
lhs_values.numel() / std::max<int64_t>(1, lhs_nnz));
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc(
"fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values));
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
uint32_t tgW = std::min(cols, tew);
MTLSize grid = MTLSizeMake(cols, 1, M);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
lhs_values, rhs_values,
lhs_match, rhs_match,
lhs_indices, out_indices,
out_values,
std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)},
std::array<uint32_t, 2>{M, cols});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (r_.scalar_type() != commonDtype) {
out_values = out_values.to(r_.scalar_type());
}
alias_into_sparse(r_, out_indices, out_values);
r_._coalesced_(true);
return r_;
}
static Tensor& add_out_dense_sparse_mps(
Tensor& out,
const Tensor& dense,
const SparseTensor& sparse,

View File

@ -0,0 +1,150 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
using namespace metal;
template <typename T>
kernel void dense_sparse_mul_kernel(
device const T* dense [[buffer(0)]],
device const T* values [[buffer(1)]],
device T* out_values [[buffer(2)]],
device const long* indices [[buffer(3)]],
device const long* sizes [[buffer(4)]],
constant uint3& sparse_params [[buffer(5)]],
uint3 gid [[thread_position_in_grid]])
{
uint col = gid.x;
uint i = gid.z;
uint nnz = sparse_params.x;
uint ndim_i = sparse_params.y;
uint view_cols = sparse_params.z;
long key = 0;
for (uint d = 0; d < ndim_i; ++d) {
long idx_d = indices[(ulong)d * (ulong)nnz + (ulong)i];
const auto sz_d = sizes[d];
key = key * sz_d + idx_d;
}
ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
const auto a = static_cast<float>(values[val_idx]);
const auto b = static_cast<float>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(a * b);
}
kernel void intersect_binary_search(
device const long* keysA [[buffer(0)]],
device const long* keysB [[buffer(1)]],
device long* outA_idx [[buffer(2)]],
device long* outB_idx [[buffer(3)]],
device atomic_uint* counter [[buffer(4)]],
constant uint& lenB [[buffer(5)]],
constant bool& A_is_lhs [[buffer(6)]],
uint3 tid_in_grid [[thread_position_in_grid]])
{
uint gid = tid_in_grid.x;
long key = keysA[gid];
// lower_bound in B
uint lo = 0;
uint hi = lenB;
while (lo < hi) {
uint mid = (lo + hi) >> 1;
long v = keysB[mid];
if (v < key) lo = mid + 1;
else hi = mid;
}
if (lo < lenB && keysB[lo] == key) {
uint pos = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
if (A_is_lhs) {
outA_idx[pos] = (long)gid;
outB_idx[pos] = (long)lo;
} else {
outA_idx[pos] = (long)lo;
outB_idx[pos] = (long)gid;
}
}
}
template <typename T>
kernel void fused_gather_mul_kernel(
device const T* lhs_vals [[buffer(0)]],
device const T* rhs_vals [[buffer(1)]],
device const long* lhs_sel [[buffer(2)]],
device const long* rhs_sel [[buffer(3)]],
device const long* lhs_indices [[buffer(4)]],
device long* out_indices [[buffer(5)]],
device T* out_vals [[buffer(6)]],
constant uint2& dims_input [[buffer(7)]],
constant uint2& dims_output [[buffer(8)]],
uint3 gid [[thread_position_in_grid]])
{
const uint col = gid.x;
const uint k = gid.z;
const uint n_dim_i = dims_input.x;
const uint L = dims_input.y;
const uint M = dims_output.x;
const uint view_cols = dims_output.y;
const long iL = lhs_sel[k];
const long iR = rhs_sel[k];
if (col < view_cols) {
const ulong offL = (ulong)iL * (ulong)view_cols + (ulong)col;
const ulong offR = (ulong)iR * (ulong)view_cols + (ulong)col;
const ulong offO = (ulong)k * (ulong)view_cols + (ulong)col;
const float a = (float)lhs_vals[offL];
const float b = (float)rhs_vals[offR];
out_vals[offO] = (T)(a * b);
}
// One thread per match copies the indices column
if (col == 0) {
const ulong uL = (ulong)L;
const ulong uM = (ulong)M;
const ulong src_col = (ulong)iL; // gather from lhs
for (uint d = 0; d < n_dim_i; ++d) {
const long v = lhs_indices[(ulong)d * uL + src_col];
out_indices[(ulong)d * uM + (ulong)k] = v;
}
}
}
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
dense_sparse_mul_kernel<DTYPE>( \
device const DTYPE* dense [[buffer(0)]], \
device const DTYPE* values [[buffer(1)]], \
device DTYPE* out_values [[buffer(2)]], \
device const long* indices [[buffer(3)]], \
device const long* sizes [[buffer(4)]], \
constant uint3& sparse_params [[buffer(5)]], \
uint3 gid [[thread_position_in_grid]]);
INSTANTIATE_DENSE_SPARSE_MUL(float);
INSTANTIATE_DENSE_SPARSE_MUL(half);
INSTANTIATE_DENSE_SPARSE_MUL(bfloat);
#define INSTANTIATE_FUSED_GATHER_MUL(DTYPE) \
template [[host_name("fused_gather_mul_kernel_" #DTYPE)]] kernel void \
fused_gather_mul_kernel<DTYPE>( \
device const DTYPE* lhs_vals [[buffer(0)]], \
device const DTYPE* rhs_vals [[buffer(1)]], \
device const long* lhs_sel [[buffer(2)]], \
device const long* rhs_sel [[buffer(3)]], \
device const long* lhs_indices [[buffer(4)]], \
device long* out_indices [[buffer(5)]], \
device DTYPE* out_vals [[buffer(6)]], \
constant uint2& dims_input [[buffer(7)]], \
constant uint2& dims_output [[buffer(8)]], \
uint3 gid [[thread_position_in_grid]]);
INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);
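Taken together with the host code above, these kernels appear to implement elementwise sparse multiplication on MPS by flattening each nonzero's multi-dimensional index into one linear key, intersecting the two sorted key lists with a per-key binary search, and then gathering and multiplying the matched value rows. The sketch below is not the shipped implementation: it is a plain-PyTorch rendering of the same scheme (helper names such as flatten_keys and sparse_mul_reference are illustrative only), followed by a small usage snippet that assumes a build in which sparse COO mul is available on the MPS device.

import torch

def flatten_keys(indices, sizes):
    # Row-major flattening of COO indices into one linear key per nonzero,
    # mirroring the key computation in the kernels above.
    key = torch.zeros(indices.size(1), dtype=torch.long, device=indices.device)
    for d in range(indices.size(0)):
        key = key * sizes[d] + indices[d]
    return key

def sparse_mul_reference(lhs, rhs):
    # lhs, rhs: sparse COO tensors of identical shape.
    lhs, rhs = lhs.coalesce(), rhs.coalesce()
    sizes = lhs.shape[:lhs.sparse_dim()]
    kl = flatten_keys(lhs.indices(), sizes)
    kr = flatten_keys(rhs.indices(), sizes)
    # Coalesced keys are sorted, so the intersection is one lower_bound per key,
    # as in intersect_binary_search.
    pos = torch.searchsorted(kr, kl)
    in_range = pos < kr.numel()
    hit = in_range.clone()
    hit[in_range] = kr[pos[in_range]] == kl[in_range]
    lhs_sel = hit.nonzero().flatten()
    rhs_sel = pos[lhs_sel]
    # Gather the matched value rows and multiply them (fused_gather_mul_kernel).
    out_vals = lhs.values()[lhs_sel] * rhs.values()[rhs_sel]
    out_idx = lhs.indices()[:, lhs_sel]
    return torch.sparse_coo_tensor(out_idx, out_vals, lhs.shape)

# Usage sketch on MPS (assumes this diff's sparse mul support is present):
i = torch.tensor([[0, 1, 1], [2, 0, 2]])
a = torch.sparse_coo_tensor(i, torch.tensor([3.0, 4.0, 5.0]), (2, 3), device="mps")
b = torch.sparse_coo_tensor(i, torch.tensor([6.0, 7.0, 8.0]), (2, 3), device="mps")
print((a * b).coalesce().values())            # sparse * sparse
print(a * torch.ones(2, 3, device="mps"))     # sparse * dense

Note that the reference relies on coalesced inputs so the flattened keys are sorted; the Metal path makes the same assumption when it binary-searches keysB.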

@@ -72,6 +72,12 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

@@ -55,6 +55,12 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -167,3 +167,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -98,11 +98,11 @@ dlrm,pass,0
doctr_det_predictor,pass,5
doctr_det_predictor,pass,3
doctr_reco_predictor,pass,4
doctr_reco_predictor,pass,1

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -98,11 +98,11 @@ dlrm,pass,0
doctr_det_predictor,pass,5
doctr_det_predictor,pass,3
doctr_reco_predictor,pass,4
doctr_reco_predictor,pass,1

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -98,11 +98,11 @@ dlrm,pass,0
doctr_det_predictor,pass,5
doctr_det_predictor,pass,3
doctr_reco_predictor,pass,4
doctr_reco_predictor,pass,1

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0

@@ -82,11 +82,11 @@ dlrm,pass,0
doctr_det_predictor,pass,5
doctr_det_predictor,pass,3
doctr_reco_predictor,pass,4
doctr_reco_predictor,pass,1

@@ -98,11 +98,11 @@ dlrm,pass,0
doctr_det_predictor,pass,5
doctr_det_predictor,pass,3
doctr_reco_predictor,pass,4
doctr_reco_predictor,pass,1

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


Some files were not shown because too many files have changed in this diff.