Compare commits

..

2 Commits

3226 changed files with 41051 additions and 118271 deletions

View File

@ -27,7 +27,6 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi fi

View File

@ -31,47 +31,33 @@ def build_ArmComputeLibrary() -> None:
"build=native", "build=native",
] ]
acl_install_dir = "/acl" acl_install_dir = "/acl"
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") acl_checkout_dir = "ComputeLibrary"
if os.path.isdir(acl_install_dir): os.makedirs(acl_install_dir)
shutil.rmtree(acl_install_dir) check_call(
if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)): [
check_call( "git",
[ "clone",
"git", "https://github.com/ARM-software/ComputeLibrary.git",
"clone", "-b",
"https://github.com/ARM-software/ComputeLibrary.git", "v25.02",
"-b", "--depth",
"v25.02", "1",
"--depth", "--shallow-submodules",
"1", ]
"--shallow-submodules", )
]
)
check_call( check_call(
["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
+ acl_build_flags,
cwd=acl_checkout_dir, cwd=acl_checkout_dir,
) )
for d in ["arm_compute", "include", "utils", "support", "src", "build"]: for d in ["arm_compute", "include", "utils", "support", "src"]:
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
def replace_tag(filename) -> None: def update_wheel(wheel_path, desired_cuda) -> None:
with open(filename) as f:
lines = f.readlines()
for i, line in enumerate(lines):
if line.startswith("Tag:"):
lines[i] = line.replace("-linux_", "-manylinux_2_28_")
print(f"Updated tag from {line} to {lines[i]}")
break
with open(filename, "w") as f:
f.writelines(lines)
def package_cuda_wheel(wheel_path, desired_cuda) -> None:
""" """
Package the cuda wheel libraries Update the cuda wheel libraries
""" """
folder = os.path.dirname(wheel_path) folder = os.path.dirname(wheel_path)
wheelname = os.path.basename(wheel_path) wheelname = os.path.basename(wheel_path)
@ -102,19 +88,30 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/lib64/libgfortran.so.5", "/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so", "/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so", "/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
] ]
if enable_cuda:
if "128" in desired_cuda:
libs_to_copy += [ libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8", "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/cuda/lib64/libcufile.so.0", "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1", "/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
]
if "126" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
elif "128" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
else:
libs_to_copy += [
"/opt/OpenBLAS/lib/libopenblas.so.0",
] ]
# Copy libraries to unzipped_folder/a/lib # Copy libraries to unzipped_folder/a/lib
for lib_path in libs_to_copy: for lib_path in libs_to_copy:
lib_name = os.path.basename(lib_path) lib_name = os.path.basename(lib_path)
@ -123,13 +120,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
f"cd {folder}/tmp/torch/lib/; " f"cd {folder}/tmp/torch/lib/; "
f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
) )
# Make sure the wheel is tagged with manylinux_2_28
for f in os.scandir(f"{folder}/tmp/"):
if f.is_dir() and f.name.endswith(".dist-info"):
replace_tag(f"{f.path}/WHEEL")
break
os.mkdir(f"{folder}/cuda_wheel") os.mkdir(f"{folder}/cuda_wheel")
os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
shutil.move( shutil.move(
@ -204,10 +194,8 @@ if __name__ == "__main__":
).decode() ).decode()
print("Building PyTorch wheel") print("Building PyTorch wheel")
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b) os.system("cd /pytorch; python setup.py clean")
if enable_cuda:
build_vars = "MAX_JOBS=5 " + build_vars
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA") desired_cuda = os.getenv("DESIRED_CUDA")
@ -254,6 +242,6 @@ if __name__ == "__main__":
print("Updating Cuda Dependency") print("Updating Cuda Dependency")
filename = os.listdir("/pytorch/dist/") filename = os.listdir("/pytorch/dist/")
wheel_path = f"/pytorch/dist/{filename[0]}" wheel_path = f"/pytorch/dist/{filename[0]}"
package_cuda_wheel(wheel_path, desired_cuda) update_wheel(wheel_path, desired_cuda)
pytorch_wheel_name = complete_wheel("/pytorch/") pytorch_wheel_name = complete_wheel("/pytorch/")
print(f"Build Complete. Created {pytorch_wheel_name}..") print(f"Build Complete. Created {pytorch_wheel_name}..")

View File

@ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
built on Jenkins and are used in triggered builds already have this built on Jenkins and are used in triggered builds already have this
environment variable set in their manifest. Also see environment variable set in their manifest. Also see
`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.

View File

@ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
echo 'Skipping tests' echo 'Skipping tests'
exit 0 exit 0
fi fi
if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
# temporary to locate some kernel issues on the CI nodes
export HSAKMT_DEBUG_LEVEL=4
fi
# These additional packages are needed for circleci ROCm builds. # These additional packages are needed for circleci ROCm builds.
if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
# Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by

View File

@ -34,5 +34,5 @@ See `build.sh` for valid build environments (it's the giant switch).
./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
# Set flags (see build.sh) and build image # Set flags (see build.sh) and build image
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
``` ```

View File

@ -1,7 +1,6 @@
ARG CUDA_VERSION=12.4 ARG CUDA_VERSION=12.4
ARG BASE_TARGET=cuda${CUDA_VERSION} ARG BASE_TARGET=cuda${CUDA_VERSION}
ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete FROM amd64/almalinux:8 as base
FROM amd64/almalinux:8.10-20250519 as base
ENV LC_ALL en_US.UTF-8 ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8 ENV LANG en_US.UTF-8
@ -9,10 +8,12 @@ ENV LANGUAGE en_US.UTF-8
ARG DEVTOOLSET_VERSION=11 ARG DEVTOOLSET_VERSION=11
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN yum -y update RUN yum -y update
RUN yum -y install epel-release RUN yum -y install epel-release
# install glibc-langpack-en make sure en_US.UTF-8 locale is available
RUN yum -y install glibc-langpack-en
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
# Just add everything as a safe.directory for git since these will be used in multiple places with git # Just add everything as a safe.directory for git since these will be used in multiple places with git
RUN git config --global --add safe.directory '*' RUN git config --global --add safe.directory '*'
@ -40,12 +41,11 @@ RUN bash ./install_conda.sh && rm install_conda.sh
# Install CUDA # Install CUDA
FROM base as cuda FROM base as cuda
ARG CUDA_VERSION=12.6 ARG CUDA_VERSION=12.4
RUN rm -rf /usr/local/cuda-* RUN rm -rf /usr/local/cuda-*
ADD ./common/install_cuda.sh install_cuda.sh ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
COPY ./common/install_cusparselt.sh install_cusparselt.sh
ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
# Preserve CUDA_VERSION for the builds # Preserve CUDA_VERSION for the builds
ENV CUDA_VERSION=${CUDA_VERSION} ENV CUDA_VERSION=${CUDA_VERSION}
@ -56,20 +56,18 @@ FROM cuda as cuda11.8
RUN bash ./install_cuda.sh 11.8 RUN bash ./install_cuda.sh 11.8
ENV DESIRED_CUDA=11.8 ENV DESIRED_CUDA=11.8
FROM cuda as cuda12.1
RUN bash ./install_cuda.sh 12.1
ENV DESIRED_CUDA=12.1
FROM cuda as cuda12.4
RUN bash ./install_cuda.sh 12.4
ENV DESIRED_CUDA=12.4
FROM cuda as cuda12.6 FROM cuda as cuda12.6
RUN bash ./install_cuda.sh 12.6 RUN bash ./install_cuda.sh 12.6
ENV DESIRED_CUDA=12.6 ENV DESIRED_CUDA=12.6
FROM cuda as cuda12.8
RUN bash ./install_cuda.sh 12.8
ENV DESIRED_CUDA=12.8
FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
ENV MKLROOT /opt/intel
# Install MNIST test data # Install MNIST test data
FROM base as mnist FROM base as mnist
ADD ./common/install_mnist.sh install_mnist.sh ADD ./common/install_mnist.sh install_mnist.sh
@ -77,8 +75,9 @@ RUN bash ./install_mnist.sh
FROM base as all_cuda FROM base as all_cuda
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8 COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
COPY --from=cuda12.4 /usr/local/cuda-12.8 /usr/local/cuda-12.8
# Final step # Final step
FROM ${BASE_TARGET} as final FROM ${BASE_TARGET} as final

View File

@ -15,16 +15,9 @@ fi
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
CUDA_VERSION="" CUDA_VERSION=""
ROCM_VERSION=""
EXTRA_BUILD_ARGS=""
if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
# extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8 # extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}"
elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
# extract rocm version from image name and tag. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete"
fi fi
case ${DOCKER_TAG_PREFIX} in case ${DOCKER_TAG_PREFIX} in
@ -34,9 +27,6 @@ case ${DOCKER_TAG_PREFIX} in
cuda*) cuda*)
BASE_TARGET=cuda${CUDA_VERSION} BASE_TARGET=cuda${CUDA_VERSION}
;; ;;
rocm*)
BASE_TARGET=rocm
;;
*) *)
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
exit 1 exit 1
@ -57,8 +47,8 @@ docker build \
--target final \ --target final \
--progress plain \ --progress plain \
--build-arg "BASE_TARGET=${BASE_TARGET}" \ --build-arg "BASE_TARGET=${BASE_TARGET}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "DEVTOOLSET_VERSION=11" \ --build-arg "DEVTOOLSET_VERSION=11" \
${EXTRA_BUILD_ARGS} \
-t ${tmp_tag} \ -t ${tmp_tag} \
$@ \ $@ \
-f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \ -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \

View File

@ -85,6 +85,9 @@ elif [[ "$image" == *linter* ]]; then
DOCKERFILE="linter/Dockerfile" DOCKERFILE="linter/Dockerfile"
fi fi
# CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then if [[ "$image" == *rocm* ]]; then
@ -92,32 +95,62 @@ if [[ "$image" == *rocm* ]]; then
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
fi fi
tag=$(echo $image | awk -F':' '{print $2}')
# It's annoying to rename jobs every time you want to rewrite a # It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it # configuration, so we hardcode everything here rather than do it
# from scratch # from scratch
case "$tag" in case "$image" in
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11) pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11)
CUDA_VERSION=12.6.3 CUDA_VERSION=12.6.3
CUDNN_VERSION=9 CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11 GCC_VERSION=11
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1 CUDA_VERSION=12.4.1
CUDNN_VERSION=9 CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9 GCC_VERSION=9
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
PROTOBUF=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
INDUCTOR_BENCHMARKS=yes INDUCTOR_BENCHMARKS=yes
;; ;;
@ -126,161 +159,196 @@ case "$tag" in
CUDNN_VERSION=9 CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9 GCC_VERSION=9
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6 CUDA_VERSION=12.6.3
CUDNN_VERSION=9 CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9 GCC_VERSION=9
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
INDUCTOR_BENCHMARKS=yes INDUCTOR_BENCHMARKS=yes
;; ;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6 CUDA_VERSION=12.6.3
CUDNN_VERSION=9 CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12 ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9 GCC_VERSION=9
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
INDUCTOR_BENCHMARKS=yes INDUCTOR_BENCHMARKS=yes
;; ;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6 CUDA_VERSION=12.6.3
CUDNN_VERSION=9 CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13 ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9 GCC_VERSION=9
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
INDUCTOR_BENCHMARKS=yes INDUCTOR_BENCHMARKS=yes
;; ;;
pytorch-linux-focal-cuda12.8-cudnn9-py3-gcc9) pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1 CUDA_VERSION=11.8.0
CUDNN_VERSION=9 CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9 GCC_VERSION=9
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-focal-py3-clang10-onnx) pytorch-linux-focal-py3-clang10-onnx)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10 CLANG_VERSION=10
PROTOBUF=yes
VISION=yes VISION=yes
CONDA_CMAKE=yes
ONNX=yes ONNX=yes
;; ;;
pytorch-linux-focal-py3.9-clang10) pytorch-linux-focal-py3.9-clang10)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10 CLANG_VERSION=10
PROTOBUF=yes
VISION=yes VISION=yes
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-focal-py3.11-clang10) pytorch-linux-focal-py3.11-clang10)
ANACONDA_PYTHON_VERSION=3.11 ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=10 CLANG_VERSION=10
PROTOBUF=yes
VISION=yes VISION=yes
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-focal-py3.9-gcc9) pytorch-linux-focal-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9 GCC_VERSION=9
PROTOBUF=yes
VISION=yes VISION=yes
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-jammy-rocm-n-1-py3) pytorch-linux-focal-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11 GCC_VERSION=11
PROTOBUF=yes
VISION=yes
ROCM_VERSION=6.2.4
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-rocm-n-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
PROTOBUF=yes
VISION=yes VISION=yes
ROCM_VERSION=6.3 ROCM_VERSION=6.3
NINJA_VERSION=1.9.0 NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
KATEX=yes KATEX=yes
UCX_COMMIT=${_UCX_COMMIT} UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT} UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes INDUCTOR_BENCHMARKS=yes
;; ;;
pytorch-linux-jammy-rocm-n-py3) pytorch-linux-jammy-xpu-2024.0-py3)
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11 GCC_VERSION=11
PROTOBUF=yes
VISION=yes VISION=yes
ROCM_VERSION=6.4 XPU_VERSION=0.5
NINJA_VERSION=1.9.0 NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;; ;;
pytorch-linux-jammy-xpu-2025.0-py3) pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11 GCC_VERSION=11
PROTOBUF=yes
VISION=yes VISION=yes
XPU_VERSION=2025.0 XPU_VERSION=2025.0
NINJA_VERSION=1.9.0 NINJA_VERSION=1.9.0
TRITON=yes CONDA_CMAKE=yes
;;
pytorch-linux-jammy-xpu-2025.1-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.1
NINJA_VERSION=1.9.0
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11 GCC_VERSION=11
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
DOCS=yes DOCS=yes
INDUCTOR_BENCHMARKS=yes INDUCTOR_BENCHMARKS=yes
;; ;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1 CUDA_VERSION=11.8
CUDNN_VERSION=9 CUDNN_VERSION=9
CLANG_VERSION=12 CLANG_VERSION=12
PROTOBUF=yes
VISION=yes VISION=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-jammy-py3-clang12-asan) pytorch-linux-jammy-py3-clang12-asan)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12 CLANG_VERSION=12
PROTOBUF=yes
VISION=yes VISION=yes
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
;; ;;
pytorch-linux-jammy-py3-clang15-asan) pytorch-linux-jammy-py3-clang15-asan)
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=15 CLANG_VERSION=15
CONDA_CMAKE=yes
VISION=yes VISION=yes
;; ;;
pytorch-linux-jammy-py3-clang18-asan) pytorch-linux-jammy-py3-clang18-asan)
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=18 CLANG_VERSION=18
CONDA_CMAKE=yes
VISION=yes VISION=yes
;; ;;
pytorch-linux-jammy-py3.9-gcc11) pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9 ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11 GCC_VERSION=11
PROTOBUF=yes
VISION=yes VISION=yes
KATEX=yes KATEX=yes
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
DOCS=yes DOCS=yes
UNINSTALL_DILL=yes UNINSTALL_DILL=yes
@ -288,12 +356,14 @@ case "$tag" in
pytorch-linux-jammy-py3-clang12-executorch) pytorch-linux-jammy-py3-clang12-executorch)
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12 CLANG_VERSION=12
CONDA_CMAKE=yes
EXECUTORCH=yes EXECUTORCH=yes
;; ;;
pytorch-linux-jammy-py3.12-halide) pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.6 CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12 ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11 GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes HALIDE=yes
TRITON=yes TRITON=yes
;; ;;
@ -301,6 +371,7 @@ case "$tag" in
CUDA_VERSION=12.6 CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12 ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11 GCC_VERSION=11
CONDA_CMAKE=yes
TRITON_CPU=yes TRITON_CPU=yes
;; ;;
pytorch-linux-focal-linter) pytorch-linux-focal-linter)
@ -308,16 +379,20 @@ case "$tag" in
# We will need to update mypy version eventually, but that's for another day. The task # We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11 # would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9 PYTHON_VERSION=3.9
PIP_CMAKE=yes
;; ;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9 PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1 CUDA_VERSION=11.8
PIP_CMAKE=yes
;; ;;
pytorch-linux-jammy-aarch64-py3.10-gcc11) pytorch-linux-jammy-aarch64-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11 GCC_VERSION=11
ACL=yes ACL=yes
PROTOBUF=yes
VISION=yes VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version # snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific # from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -326,7 +401,9 @@ case "$tag" in
ANACONDA_PYTHON_VERSION=3.10 ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11 GCC_VERSION=11
ACL=yes ACL=yes
PROTOBUF=yes
VISION=yes VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version # snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific # from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -334,6 +411,7 @@ case "$tag" in
;; ;;
*) *)
# Catch-all for builds that are not hardcoded. # Catch-all for builds that are not hardcoded.
PROTOBUF=yes
VISION=yes VISION=yes
echo "image '$image' did not match an existing build configuration" echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then if [[ "$image" == *py* ]]; then
@ -349,7 +427,8 @@ case "$tag" in
TRITON=yes TRITON=yes
# To ensure that any ROCm config will build using conda cmake # To ensure that any ROCm config will build using conda cmake
# and thus have LAPACK/MKL enabled # and thus have LAPACK/MKL enabled
fi CONDA_CMAKE=yes
fi
if [[ "$image" == *centos7* ]]; then if [[ "$image" == *centos7* ]]; then
NINJA_VERSION=1.10.2 NINJA_VERSION=1.10.2
fi fi
@ -365,6 +444,9 @@ case "$tag" in
if [[ "$image" == *glibc* ]]; then if [[ "$image" == *glibc* ]]; then
extract_version_from_image_name glibc GLIBC_VERSION extract_version_from_image_name glibc GLIBC_VERSION
fi fi
if [[ "$image" == *cmake* ]]; then
extract_version_from_image_name cmake CMAKE_VERSION
fi
;; ;;
esac esac
@ -391,6 +473,7 @@ docker build \
${no_cache_flag} \ ${no_cache_flag} \
${progress_flag} \ ${progress_flag} \
--build-arg "BUILD_ENVIRONMENT=${image}" \ --build-arg "BUILD_ENVIRONMENT=${image}" \
--build-arg "PROTOBUF=${PROTOBUF:-}" \
--build-arg "LLVMDEV=${LLVMDEV:-}" \ --build-arg "LLVMDEV=${LLVMDEV:-}" \
--build-arg "VISION=${VISION:-}" \ --build-arg "VISION=${VISION:-}" \
--build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
@ -405,6 +488,7 @@ docker build \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
--build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \ --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
--build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
--build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \ --build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
@ -412,6 +496,8 @@ docker build \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \ --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
--build-arg "PIP_CMAKE=${PIP_CMAKE}" \
--build-arg "TRITON=${TRITON}" \ --build-arg "TRITON=${TRITON}" \
--build-arg "TRITON_CPU=${TRITON_CPU}" \ --build-arg "TRITON_CPU=${TRITON_CPU}" \
--build-arg "ONNX=${ONNX}" \ --build-arg "ONNX=${ONNX}" \
@ -420,7 +506,6 @@ docker build \
--build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \ --build-arg "HALIDE=${HALIDE}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \ --build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
--build-arg "ACL=${ACL:-}" \ --build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
@ -497,12 +582,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then
echo "expecting triton to not be installed, but it is" echo "expecting triton to not be installed, but it is"
exit 1 exit 1
fi fi
# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if
# they support 4.0.0 yet, so exclude them from this check.
CMAKE_VERSION=$(drun cmake --version)
if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
echo "CMake version is not 4.0.0:"
drun cmake --version
exit 1
fi

View File

@ -17,8 +17,9 @@ RUN bash ./install_base.sh && rm install_base.sh
# Update CentOS git version # Update CentOS git version
RUN yum -y remove git RUN yum -y remove git
RUN yum -y remove git-* RUN yum -y remove git-*
RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \ RUN yum -y install https://packages.endpoint.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm || \
sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo (yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \
sed -i "s/packages.endpoint/packages.endpointdev/" /etc/yum.repos.d/endpoint.repo)
RUN yum install -y git RUN yum install -y git
# Install devtoolset # Install devtoolset
@ -39,6 +40,7 @@ RUN bash ./install_user.sh && rm install_user.sh
# Install conda and other packages (e.g., numpy, pytest) # Install conda and other packages (e.g., numpy, pytest)
ARG ANACONDA_PYTHON_VERSION ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt COPY requirements-ci.txt /opt/conda/requirements-ci.txt
@ -46,6 +48,13 @@ COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# (optional) Install protobuf for ONNX
ARG PROTOBUF
COPY ./common/install_protobuf.sh install_protobuf.sh
RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install vision packages like OpenCV # (optional) Install vision packages like OpenCV
ARG VISION ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -73,6 +82,12 @@ ENV MAGMA_HOME /opt/rocm/magma
ENV LANG en_US.utf8 ENV LANG en_US.utf8
ENV LC_ALL en_US.utf8 ENV LC_ALL en_US.utf8
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
# (optional) Install non-default Ninja version # (optional) Install non-default Ninja version
ARG NINJA_VERSION ARG NINJA_VERSION
COPY ./common/install_ninja.sh install_ninja.sh COPY ./common/install_ninja.sh install_ninja.sh

View File

@ -1 +1 @@
b173722085b3f555d6ba4533d6bbaddfd7c71144 381ae5d57d35c165d98df728380b20fbde350392

View File

@ -1 +1 @@
v2.26.5-1 v2.26.2-1

View File

@ -1 +1 @@
b0e26b7359c147b8aa0af686c20510fb9b15990a 0bcc8265e677e5321606a3311bf71470f14456a8

View File

@ -1 +1 @@
c8757738a7418249896224430ce84888e8ecdd79 96316ce50fade7e209553aba4898cd9b82aab83b

View File

@ -99,6 +99,9 @@ install_centos() {
ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt"
numpy_deps="gcc-gfortran" numpy_deps="gcc-gfortran"
# Note: protobuf-c-{compiler,devel} on CentOS are too old to be used
# for Caffe2. That said, we still install them to make sure the build
# system opts to build/use protoc and libprotobuf from third-party.
yum install -y \ yum install -y \
$ccache_deps \ $ccache_deps \
$numpy_deps \ $numpy_deps \

View File

@ -9,7 +9,7 @@ install_ubuntu() {
# Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`` # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
apt-get install -y cargo apt-get install -y cargo
echo "Checking out sccache repo" echo "Checking out sccache repo"
git clone https://github.com/mozilla/sccache -b v0.10.0 git clone https://github.com/mozilla/sccache -b v0.9.1
cd sccache cd sccache
echo "Building sccache" echo "Building sccache"
cargo build --release cargo build --release

View File

@ -0,0 +1,31 @@
#!/bin/bash
set -ex
[ -n "$CMAKE_VERSION" ]
# Remove system cmake install so it won't get used instead
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
apt-get remove cmake -y
;;
centos)
yum remove cmake -y
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac
# Turn 3.6.3 into v3.6
path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/')
file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
# Download and install specific CMake version in /usr/local
pushd /tmp
curl -Os --retry 3 "https://cmake.org/files/${path}/${file}"
tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz
rm -f cmake-*.tar.gz
popd

View File

@ -7,7 +7,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
BASE_URL="https://repo.anaconda.com/miniconda" BASE_URL="https://repo.anaconda.com/miniconda"
CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" # @lint-ignore BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"
CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" CONDA_FILE="Miniforge3-Linux-$(uname -m).sh"
fi fi
@ -75,11 +75,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# and libpython-static for torch deploy # and libpython-static for torch deploy
conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}" conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
# Use conda cmake in some cases. Conda cmake will be newer than our supported
# min version (3.5 for xenial and 3.10 for bionic), so we only do it in those
# following builds that we know should use conda. Specifically, Ubuntu bionic
# and focal cannot find conda mkl with stock cmake, so we need a cmake from conda
if [ -n "${CONDA_CMAKE}" ]; then
conda_install cmake
fi
# Magma package names are concatenation of CUDA major and minor ignoring revision # Magma package names are concatenation of CUDA major and minor ignoring revision
# I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89 # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89
# Magma is installed from a tarball in the ossci-linux bucket into the conda env # Magma is installed from a tarball in the ossci-linux bucket into the conda env
if [ -n "$CUDA_VERSION" ]; then if [ -n "$CUDA_VERSION" ]; then
conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION}
fi fi
# Install some other packages, including those needed for Python test reporting # Install some other packages, including those needed for Python test reporting

View File

@ -3,7 +3,7 @@
set -uex -o pipefail set -uex -o pipefail
PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads # @lint-ignore PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads
GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
# Python versions to be installed in /opt/$VERSION_NO # Python versions to be installed in /opt/$VERSION_NO

View File

@ -2,54 +2,64 @@
set -ex set -ex
arch_path='' CUDNN_VERSION=9.5.1.17
targetarch=${TARGETARCH:-$(uname -m)}
if [ ${targetarch} = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then
arch_path='x86_64'
else
arch_path='sbsa'
fi
function install_cuda { function install_cusparselt_040 {
version=$1 # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
runfile=$2 mkdir tmp_cusparselt && pushd tmp_cusparselt
major_minor=${version%.*} wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
rm -rf /usr/local/cuda-${major_minor} /usr/local/cuda tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
if [[ ${arch_path} == 'sbsa' ]]; then cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
runfile="${runfile}_sbsa" cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
fi popd
runfile="${runfile}.run" rm -rf tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile} -O ${runfile}
chmod +x ${runfile}
./${runfile} --toolkit --silent
rm -f ${runfile}
rm -f /usr/local/cuda && ln -s /usr/local/cuda-${major_minor} /usr/local/cuda
} }
function install_cudnn { function install_cusparselt_062 {
cuda_major_version=$1 # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
cudnn_version=$2 mkdir tmp_cusparselt && pushd tmp_cusparselt
mkdir tmp_cudnn && cd tmp_cudnn wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive" cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
tar xf ${filepath}.tar.xz popd
cp -a ${filepath}/include/* /usr/local/cuda/include/ rm -rf tmp_cusparselt
cp -a ${filepath}/lib/* /usr/local/cuda/lib64/ }
cd ..
rm -rf tmp_cudnn function install_cusparselt_063 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
} }
function install_118 { function install_118 {
CUDNN_VERSION=9.1.0.70 CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.4.0" echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.4.0"
install_cuda 11.8.0 cuda_11.8.0_520.61.05_linux rm -rf /usr/local/cuda-11.8 /usr/local/cuda
# install CUDA 11.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
chmod +x cuda_11.8.0_520.61.05_linux.run
./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
rm -f cuda_11.8.0_520.61.05_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
install_cudnn 11 $CUDNN_VERSION # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
CUDA_VERSION=11.8 bash install_nccl.sh CUDA_VERSION=11.8 bash install_nccl.sh
CUDA_VERSION=11.8 bash install_cusparselt.sh install_cusparselt_040
ldconfig ldconfig
} }
@ -57,27 +67,52 @@ function install_118 {
function install_124 { function install_124 {
CUDNN_VERSION=9.1.0.70 CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2" echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
install_cuda 12.4.1 cuda_12.4.1_550.54.15_linux rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
chmod +x cuda_12.4.1_550.54.15_linux.run
./cuda_12.4.1_550.54.15_linux.run --toolkit --silent
rm -f cuda_12.4.1_550.54.15_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
install_cudnn 12 $CUDNN_VERSION # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
CUDA_VERSION=12.4 bash install_nccl.sh CUDA_VERSION=12.4 bash install_nccl.sh
CUDA_VERSION=12.4 bash install_cusparselt.sh install_cusparselt_062
ldconfig ldconfig
} }
function install_126 { function install_126 {
CUDNN_VERSION=9.5.1.17
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3" echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux rm -rf /usr/local/cuda-12.6 /usr/local/cuda
# install CUDA 12.6.3 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
chmod +x cuda_12.6.3_560.35.05_linux.run
./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
rm -f cuda_12.6.3_560.35.05_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
install_cudnn 12 $CUDNN_VERSION # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
CUDA_VERSION=12.6 bash install_nccl.sh CUDA_VERSION=12.6 bash install_nccl.sh
CUDA_VERSION=12.6 bash install_cusparselt.sh install_cusparselt_063
ldconfig ldconfig
} }
@ -183,16 +218,27 @@ function prune_126 {
function install_128 { function install_128 {
CUDNN_VERSION=9.8.0.87 CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3" echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
# install CUDA 12.8.1 in the same container rm -rf /usr/local/cuda-12.8 /usr/local/cuda
install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux # install CUDA 12.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
chmod +x cuda_12.8.0_570.86.10_linux.run
./cuda_12.8.0_570.86.10_linux.run --toolkit --silent
rm -f cuda_12.8.0_570.86.10_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 12 $CUDNN_VERSION mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
CUDA_VERSION=12.8 bash install_nccl.sh CUDA_VERSION=12.8 bash install_nccl.sh
CUDA_VERSION=12.8 bash install_cusparselt.sh install_cusparselt_063
ldconfig ldconfig
} }
@ -205,9 +251,9 @@ do
;; ;;
12.4) install_124; prune_124 12.4) install_124; prune_124
;; ;;
12.6|12.6.*) install_126; prune_126 12.6) install_126; prune_126
;; ;;
12.8|12.8.*) install_128; 12.8) install_128;
;; ;;
*) echo "bad argument $1"; exit 1 *) echo "bad argument $1"; exit 1
;; ;;

View File

@ -0,0 +1,55 @@
#!/bin/bash
# Script used only in CD pipeline
set -ex
CUDNN_VERSION=9.8.0.87
function install_cusparselt_063 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
tar xf libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
}
function install_128 {
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run
chmod +x cuda_12.8.0_570.86.10_linux_sbsa.run
./cuda_12.8.0_570.86.10_linux_sbsa.run --toolkit --silent
rm -f cuda_12.8.0_570.86.10_linux_sbsa.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
CUDA_VERSION=12.8 bash install_nccl.sh
install_cusparselt_063
ldconfig
}
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
12.8) install_128;
;;
*) echo "bad argument $1"; exit 1
;;
esac
shift
done

View File

@ -13,7 +13,7 @@ clone_executorch() {
# and fetch the target commit # and fetch the target commit
pushd executorch pushd executorch
git checkout "${EXECUTORCH_PINNED_COMMIT}" git checkout "${EXECUTORCH_PINNED_COMMIT}"
git submodule update --init --recursive git submodule update --init
popd popd
chown -R jenkins executorch chown -R jenkins executorch

View File

@ -17,7 +17,7 @@ if [ -n "${UBUNTU_VERSION}" ];then
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
fi fi
pip_install numpy scipy imageio cmake ninja conda_install numpy scipy imageio cmake ninja
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
cmake -DCMAKE_BUILD_TYPE=Release \ cmake -DCMAKE_BUILD_TYPE=Release \

View File

@ -14,9 +14,16 @@ function install_timm() {
local commit local commit
commit=$(get_pinned_commit timm) commit=$(get_pinned_commit timm)
# TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
# I'm using nightly here instead. We just need to package to be able to install
# TIMM. Removing this once vision has a release on 3.13
if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
fi
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
# Clean up # Clean up
conda_run pip uninstall -y torch torchvision triton conda_run pip uninstall -y cmake torch torchvision triton
} }
# Pango is needed for weasyprint which is needed for doctr # Pango is needed for weasyprint which is needed for doctr

View File

@ -1,23 +1,26 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Script that installs magma from tarball inside conda environment. # Script that replaces the magma install from a conda package
# It replaces anaconda magma-cuda package which is no longer published.
# Execute it inside active conda environment.
# See issue: https://github.com/pytorch/pytorch/issues/138506
set -eou pipefail set -eou pipefail
cuda_version_nodot=${1/./} function do_install() {
anaconda_dir=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} cuda_version_nodot=${1/./}
anaconda_python_version=$2
MAGMA_VERSION="2.6.1" MAGMA_VERSION="2.6.1"
magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
(
set -x anaconda_dir="/opt/conda/envs/py_${anaconda_python_version}"
tmp_dir=$(mktemp -d) (
pushd ${tmp_dir} set -x
curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} tmp_dir=$(mktemp -d)
tar -xvf "${magma_archive}" pushd ${tmp_dir}
mv include/* "${anaconda_dir}/include/" curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
mv lib/* "${anaconda_dir}/lib" tar -xvf "${magma_archive}"
popd mv include/* "${anaconda_dir}/include/"
) mv lib/* "${anaconda_dir}/lib"
popd
)
}
do_install $1 $2

View File

@ -8,6 +8,16 @@ retry () {
"$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@") "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
} }
# A bunch of custom pip dependencies for ONNX
pip_install \
beartype==0.15.0 \
filelock==3.9.0 \
flatbuffers==2.0 \
mock==5.0.1 \
ninja==1.10.2 \
networkx==2.5 \
numpy==1.24.2
# ONNXRuntime should be installed before installing # ONNXRuntime should be installed before installing
# onnx-weekly. Otherwise, onnx-weekly could be # onnx-weekly. Otherwise, onnx-weekly could be
# overwritten by onnx. # overwritten by onnx.
@ -19,8 +29,12 @@ pip_install \
transformers==4.36.2 transformers==4.36.2
pip_install coloredlogs packaging pip_install coloredlogs packaging
pip_install onnxruntime==1.18.1 pip_install onnxruntime==1.18.1
pip_install onnxscript==0.3.0 pip_install onnx==1.17.0
pip_install onnxscript==0.2.2 --no-deps
# required by onnxscript
pip_install ml_dtypes
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -4,7 +4,8 @@
set -ex set -ex
cd / cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules
OPENBLAS_BUILD_FLAGS=" OPENBLAS_BUILD_FLAGS="
NUM_THREADS=128 NUM_THREADS=128

View File

@ -0,0 +1,19 @@
#!/bin/bash
set -ex
pb_dir="/usr/temp_pb_install_dir"
mkdir -p $pb_dir
# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
# else it will fail with
# g++: error: ./../lib64/crti.o: No such file or directory
ln -s /usr/lib64 "$pb_dir/lib64"
curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
NPROC=$[$(nproc) - 2]
pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig
popd
rm -rf $pb_dir

View File

@ -13,3 +13,6 @@ source /var/lib/jenkins/ci_env/bin/activate
python -mpip install --upgrade pip python -mpip install --upgrade pip
python -mpip install -r /opt/requirements-ci.txt python -mpip install -r /opt/requirements-ci.txt
if [ -n "${PIP_CMAKE}" ]; then
python -mpip install cmake==3.31.6
fi

View File

@ -66,25 +66,17 @@ EOF
done done
# ROCm 6.3 had a regression where initializing static code objects had significant overhead # ROCm 6.3 had a regression where initializing static code objects had significant overhead
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]] || [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
HIP_BRANCH=rocm-6.3.x
VER_STR=6.3
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
VER_STR=6.4
fi
# clr build needs CppHeaderParser but can only find it using conda's python # clr build needs CppHeaderParser but can only find it using conda's python
/opt/conda/bin/python -m pip install CppHeaderParser /opt/conda/bin/python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH git clone https://github.com/ROCm/HIP -b rocm-6.3.x
HIP_COMMON_DIR=$(readlink -f HIP) HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}-statco-hotfix git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix
mkdir -p clr/build mkdir -p clr/build
pushd clr/build pushd clr/build
cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
make -j make -j
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.* cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.*
popd popd
rm -rf HIP clr rm -rf HIP clr
fi fi

View File

@ -10,8 +10,12 @@ fi
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
get_pip_version() { get_conda_version() {
conda_run pip list | grep -w $* | head -n 1 | awk '{print $2}' as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}'
}
conda_reinstall() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
} }
if [ -n "${XPU_VERSION}" ]; then if [ -n "${XPU_VERSION}" ]; then
@ -33,9 +37,11 @@ if [ -n "${UBUNTU_VERSION}" ];then
apt-get install -y gpg-agent apt-get install -y gpg-agent
fi fi
# Keep the current cmake and numpy version here, so we can reinstall them later if [ -n "${CONDA_CMAKE}" ]; then
CMAKE_VERSION=$(get_pip_version cmake) # Keep the current cmake and numpy version here, so we can reinstall them later
NUMPY_VERSION=$(get_pip_version numpy) CMAKE_VERSION=$(get_conda_version cmake)
NUMPY_VERSION=$(get_conda_version numpy)
fi
if [ -z "${MAX_JOBS}" ]; then if [ -z "${MAX_JOBS}" ]; then
export MAX_JOBS=$(nproc) export MAX_JOBS=$(nproc)
@@ -51,12 +57,7 @@ as_jenkins git clone --recursive ${TRITON_REPO} triton
 cd triton
 as_jenkins git checkout ${TRITON_PINNED_COMMIT}
 as_jenkins git submodule update --init --recursive
-# Old versions of python have setup.py in ./python; newer versions have it in ./
-if [ ! -f setup.py ]; then
-  cd python
-fi
+cd python
 
 pip_install pybind11==2.13.6
 
 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
@@ -82,19 +83,17 @@ cp dist/*.whl /opt/triton
 # Install the wheel for docker builds that don't use multi stage
 pip_install dist/*.whl
 
-# TODO: This is to make sure that the same cmake and numpy version from install conda
-# script is used. Without this step, the newer cmake version (3.25.2) downloaded by
-# triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
-# this can be removed.
-#
-# The correct numpy version also needs to be set here because conda claims that it
-# causes inconsistent environment. Without this, conda will attempt to install the
-# latest numpy version, which fails ASAN tests with the following import error: Numba
-# needs NumPy 1.20 or less.
-# Note that we install numpy with pip as conda might not have the version we want
-if [ -n "${CMAKE_VERSION}" ]; then
-  pip_install "cmake==${CMAKE_VERSION}"
-fi
-if [ -n "${NUMPY_VERSION}" ]; then
-  pip_install "numpy==${NUMPY_VERSION}"
-fi
+if [ -n "${CONDA_CMAKE}" ]; then
+  # TODO: This is to make sure that the same cmake and numpy version from install conda
+  # script is used. Without this step, the newer cmake version (3.25.2) downloaded by
+  # triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
+  # this can be removed.
+  #
+  # The correct numpy version also needs to be set here because conda claims that it
+  # causes inconsistent environment. Without this, conda will attempt to install the
+  # latest numpy version, which fails ASAN tests with the following import error: Numba
+  # needs NumPy 1.20 or less.
+  conda_reinstall cmake="${CMAKE_VERSION}"
+  # Note that we install numpy with pip as conda might not have the version we want
+  pip_install --force-reinstall numpy=="${NUMPY_VERSION}"
+fi
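For orientation, the two helpers introduced at the top of this script pair up here: get_conda_version reads the pinned version out of the conda env before the Triton build, and conda_reinstall forces it back afterwards. An illustrative round-trip (value illustrative):

    CMAKE_VERSION=$(get_conda_version cmake)     # e.g. 3.27.9
    conda_reinstall cmake="${CMAKE_VERSION}"     # restore the original pin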

View File

@@ -26,7 +26,7 @@ function install_ubuntu() {
     wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
         | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg
     echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \
-        https://apt.repos.intel.com/oneapi all main" \
+        https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \
         | tee /etc/apt/sources.list.d/oneAPI.list
 
     # Update the packages list and repository index
@@ -74,7 +74,7 @@ function install_rhel() {
     tee > /etc/yum.repos.d/oneAPI.repo << EOF
 [oneAPI]
 name=Intel for Pytorch GPU dev repository
-baseurl=https://yum.repos.intel.com/oneapi
+baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME}
 enabled=1
 gpgcheck=1
 repo_gpgcheck=1
@@ -118,7 +118,7 @@ function install_sles() {
         https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo
     rpm --import https://repositories.intel.com/gpu/intel-graphics.key
     # To add the online network package repository for the Intel Support Packages
-    zypper addrepo https://yum.repos.intel.com/oneapi oneAPI
+    zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI
     rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
 
     # The xpu-smi packages
@@ -141,10 +141,10 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
     XPU_DRIVER_VERSION=""
 fi
 
-# Default use Intel® oneAPI Deep Learning Essentials 2025.0
-if [[ "$XPU_VERSION" == "2025.1" ]]; then
-    XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
-else
-    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
-fi
+XPU_REPO_NAME="intel-for-pytorch-gpu-dev"
+XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9"
+if [[ "$XPU_VERSION" == "2025.0" ]]; then
+    XPU_REPO_NAME="oneapi"
+    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
+fi

View File

@@ -51,7 +51,6 @@ ADD ./common/install_cuda.sh install_cuda.sh
 ADD ./common/install_magma.sh install_magma.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 ENV CUDA_HOME /usr/local/cuda
 
 FROM cuda as cuda11.8

View File

@@ -32,8 +32,7 @@ ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
 
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

View File

@@ -16,6 +16,7 @@ RUN bash ./install_user.sh && rm install_user.sh
 # Install conda and other packages (e.g., numpy, pytest)
 ARG PYTHON_VERSION
+ARG PIP_CMAKE
 ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
 ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
 COPY requirements-ci.txt /opt/requirements-ci.txt

View File

@ -0,0 +1,202 @@
# syntax = docker/dockerfile:experimental
ARG ROCM_VERSION=3.7
ARG BASE_CUDA_VERSION=11.8
ARG GPU_IMAGE=centos:7
FROM centos:7 as base
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ARG DEVTOOLSET_VERSION=9
# Note: This patch is required since CentOS has reached EOL;
# otherwise any yum install step will fail
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
# Just add everything as a safe.directory for git since these will be used in multiple places with git
RUN git config --global --add safe.directory '*'
RUN yum install -y yum-utils centos-release-scl
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
# Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms
# the patch is required once again. Somehow this step adds mirror.centos.org back
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
RUN yum --enablerepo=extras install -y epel-release
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake
RUN yum install -y autoconf aclocal automake make sudo
FROM base as openssl
# Install openssl (this must precede `build python` step)
# (In order to have a proper SSL module, Python is compiled
# against a recent openssl [see env vars above], which is linked
# statically. We delete openssl afterwards.)
ADD ./common/install_openssl.sh install_openssl.sh
RUN bash ./install_openssl.sh && rm install_openssl.sh
# EPEL for cmake
FROM base as patchelf
# Install patchelf
ADD ./common/install_patchelf.sh install_patchelf.sh
RUN bash ./install_patchelf.sh && rm install_patchelf.sh
RUN cp $(which patchelf) /patchelf
FROM patchelf as python
# build python
COPY manywheel/build_scripts /build_scripts
ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
RUN bash build_scripts/build.sh && rm -r build_scripts
FROM base as cuda
ARG BASE_CUDA_VERSION=10.2
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
FROM base as intel
# MKL
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM base as magma
ARG BASE_CUDA_VERSION=10.2
# Install magma
ADD ./common/install_magma.sh install_magma.sh
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
FROM base as jni
# Install java jni header
ADD ./common/install_jni.sh install_jni.sh
ADD ./java/jni.h jni.h
RUN bash ./install_jni.sh && rm install_jni.sh
FROM base as libpng
# Install libpng
ADD ./common/install_libpng.sh install_libpng.sh
RUN bash ./install_libpng.sh && rm install_libpng.sh
FROM ${GPU_IMAGE} as common
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN yum install -y \
aclocal \
autoconf \
automake \
bison \
bzip2 \
curl \
diffutils \
file \
git \
make \
patch \
perl \
unzip \
util-linux \
wget \
which \
xz \
yasm
RUN yum install -y \
https://repo.ius.io/ius-release-el7.rpm \
https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
RUN yum swap -y git git236-core
# git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
RUN git config --global --add safe.directory "*"
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# Install LLVM version
COPY --from=openssl /opt/openssl /opt/openssl
COPY --from=python /opt/python /opt/python
COPY --from=python /opt/_internal /opt/_internal
COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
COPY --from=intel /opt/intel /opt/intel
COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
COPY --from=libpng /usr/local/include/png* /usr/local/include/
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
FROM common as cpu_final
ARG BASE_CUDA_VERSION=10.1
ARG DEVTOOLSET_VERSION=9
# Install Anaconda
ADD ./common/install_conda_docker.sh install_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh
ENV PATH /opt/conda/bin:$PATH
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y yum-utils centos-release-scl
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
# cmake is already installed inside the rocm base image, so remove if present
RUN rpm -e cmake || true
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake
# ninja
RUN yum install -y ninja-build
FROM cpu_final as cuda_final
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH
FROM cpu_final as rocm_final
ARG ROCM_VERSION=3.7
ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
# Add the ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
# can still find HIP for ROCm5.7. Not needed for ROCm6.0 and above.
# Remove below when ROCm5.7 is no longer in the support matrix.
ENV ROCM_PATH /opt/rocm
ENV MKLROOT /opt/intel
# No need to install ROCm as base docker image should have full ROCm install
#ADD ./common/install_rocm.sh install_rocm.sh
#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
# cmake3 is needed for the MIOpen build
RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

View File

@@ -7,8 +7,8 @@ ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
-ARG DEVTOOLSET_VERSION=13
-RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+ARG DEVTOOLSET_VERSION=11
+RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
@@ -33,13 +33,12 @@ RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 
 FROM base as cuda
-ARG BASE_CUDA_VERSION=12.6
+ARG BASE_CUDA_VERSION=11.8
 # Install CUDA
 ADD ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu*
 
 FROM base as intel
 # MKL
@@ -47,7 +46,7 @@ ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh
 
 FROM base as magma
-ARG BASE_CUDA_VERSION=12.6
+ARG BASE_CUDA_VERSION=10.2
 # Install magma
 ADD ./common/install_magma.sh install_magma.sh
 RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
@@ -64,7 +63,7 @@ ADD ./common/install_libpng.sh install_libpng.sh
 RUN bash ./install_libpng.sh && rm install_libpng.sh
 
 FROM ${GPU_IMAGE} as common
-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
@@ -87,12 +86,13 @@ RUN yum install -y \
         wget \
         which \
         xz \
-        glibc-langpack-en \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+        gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \
+        glibc-langpack-en
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
+RUN yum swap -y git git236-core
 
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
 # Override this behaviour by treating every folder as safe
@@ -116,8 +116,8 @@ COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/
 COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
 
 FROM common as cpu_final
-ARG BASE_CUDA_VERSION=12.6
-ARG DEVTOOLSET_VERSION=13
+ARG BASE_CUDA_VERSION=11.8
+ARG DEVTOOLSET_VERSION=11
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
 RUN bash ./install_conda.sh && rm install_conda.sh
@@ -156,11 +156,8 @@ ENV ROCM_PATH /opt/rocm
 # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
 RUN python3 -m pip install --upgrade pip && \
     python3 -mpip install cmake==3.28.4
-# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
 ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
-# ROCm 6.4 rocm-smi depends on system drm.h header
-RUN yum install -y libdrm-devel
 ENV MKLROOT /opt/intel
 ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
@@ -174,6 +171,6 @@ ENV XPU_DRIVER_TYPE ROLLING
 RUN python3 -m pip install --upgrade pip && \
     python3 -mpip install cmake==3.28.4
 ADD ./common/install_xpu.sh install_xpu.sh
-ENV XPU_VERSION 2025.1
+ENV XPU_VERSION 2025.0
 RUN bash ./install_xpu.sh && rm install_xpu.sh
 RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
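The PATH/LD_LIBRARY_PATH pair set near the top of this Dockerfile is what actually activates the chosen gcc-toolset; a quick check from inside a built image (illustrative, with the toolset number following the ARG above):

    # Confirm the gcc-toolset compiler shadows the system one
    which gcc                 # expect /opt/rh/gcc-toolset-11/root/usr/bin/gcc
    gcc --version | head -n1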

View File

@@ -1,6 +1,7 @@
 FROM quay.io/pypa/manylinux_2_28_aarch64 as base
-ARG GCCTOOLSET_VERSION=13
+# Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8.
+ARG GCCTOOLSET_VERSION=11
 
 # Language variables
 ENV LC_ALL=en_US.UTF-8
@@ -35,10 +36,7 @@ RUN yum install -y \
     yasm \
     zstd \
     sudo \
-    gcc-toolset-${GCCTOOLSET_VERSION}-gcc \
-    gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \
-    gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \
-    gcc-toolset-${GCCTOOLSET_VERSION}-gdb
+    gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
 
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
@@ -58,7 +56,6 @@ RUN git config --global --add safe.directory "*"
 
 FROM base as openblas
 # Install openblas
-ARG OPENBLAS_VERSION
 ADD ./common/install_openblas.sh install_openblas.sh
 RUN bash ./install_openblas.sh && rm install_openblas.sh

View File

@ -0,0 +1,94 @@
FROM quay.io/pypa/manylinux2014_aarch64 as base
# Graviton needs GCC 10 for the build
ARG DEVTOOLSET_VERSION=10
# Language variables
ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
# Install needed OS packages. This is to support all
# the binary builds (torch, vision, audio, text, data)
RUN yum -y install epel-release
RUN yum -y update
RUN yum install -y \
autoconf \
automake \
bison \
bzip2 \
curl \
diffutils \
file \
git \
make \
patch \
perl \
unzip \
util-linux \
wget \
which \
xz \
yasm \
less \
zstd \
libgomp \
sudo \
devtoolset-${DEVTOOLSET_VERSION}-gcc \
devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
devtoolset-${DEVTOOLSET_VERSION}-binutils
# Ensure the expected devtoolset is used
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
# git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
RUN git config --global --add safe.directory "*"
###############################################################################
# libgfortran.a hack
#
# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. To solve, get
# ubuntu's libgfortran.a which is compiled with -fPIC
# NOTE: Need a better way to get this library, as Ubuntu's package can be removed or changed by the vendor
###############################################################################
RUN cd ~/ \
&& curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-4ubuntu2_arm64.deb \
&& ar x ~/libgfortran-10-dev.deb \
&& tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
&& cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
# install cmake
RUN yum install -y cmake3 && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
FROM base as openssl
# Install openssl (this must precede `build python` step)
# (In order to have a proper SSL module, Python is compiled
# against a recent openssl [see env vars above], which is linked
# statically. We delete openssl afterwards.)
ADD ./common/install_openssl.sh install_openssl.sh
RUN bash ./install_openssl.sh && rm install_openssl.sh
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
FROM base as openblas
# Install openblas
ADD ./common/install_openblas.sh install_openblas.sh
RUN bash ./install_openblas.sh && rm install_openblas.sh
FROM openssl as final
# remove unnecessary python versions
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
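Two quick checks for the libgfortran.a swap performed earlier in this file, assuming the devtoolset is active on PATH (commands illustrative):

    # The copied archive should be the one the gcc driver resolves
    ls -l /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/libgfortran.a
    gcc -print-file-name=libgfortran.a   # should print the devtoolset path above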

View File

@@ -1,7 +1,7 @@
 FROM quay.io/pypa/manylinux_2_28_aarch64 as base
 
 # Cuda ARM build needs gcc 11
-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11
 
 # Language variables
 ENV LC_ALL=en_US.UTF-8
@@ -34,10 +34,7 @@ RUN yum install -y \
     zstd \
     libgomp \
     sudo \
-    gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
-    gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
-    gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
-    gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+    gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 
 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
@@ -69,11 +66,10 @@ RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 FROM base as cuda
 ARG BASE_CUDA_VERSION
 # Install CUDA
-ADD ./common/install_cuda.sh install_cuda.sh
+ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
 COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh install_nccl.sh ci_commit_pins/nccl-cu*
 
 FROM base as magma
 ARG BASE_CUDA_VERSION

View File

@@ -5,9 +5,7 @@ ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 ENV LANGUAGE=C.UTF-8
 
-# there is a bugfix in gcc >= 14 for precompiled headers and s390x vectorization interaction.
-# with earlier gcc versions test/inductor/test_cpu_cpp_wrapper.py will fail.
-ARG DEVTOOLSET_VERSION=14
+ARG DEVTOOLSET_VERSION=13
 
 # Install needed OS packages. This is to support all
 # the binary builds (torch, vision, audio, text, data)
 RUN yum -y install epel-release
@@ -60,8 +58,7 @@ RUN yum install -y \
     libxslt-devel \
     libxml2-devel \
     openssl-devel \
-    valgrind \
-    ninja-build
+    valgrind
 
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
@@ -106,6 +103,9 @@ CMD ["/bin/bash"]
 # install test dependencies:
 # - grpcio requires system openssl, bundled crypto fails to build
 RUN dnf install -y \
+  protobuf-devel \
+  protobuf-c-devel \
+  protobuf-lite-devel \
   hdf5-devel \
   python3-h5py \
   git
@@ -129,9 +129,6 @@ RUN pip3 install flatbuffers && \
     git clone https://github.com/microsoft/onnxruntime && \
     cd onnxruntime && git checkout v1.21.0 && \
     git submodule update --init --recursive && \
-    ./build.sh --config Release --parallel 0 --enable_pybind \
-        --build_wheel --enable_training --enable_training_apis \
-        --enable_training_ops --skip_tests --allow_running_as_root \
-        --compile_no_warning_as_error && \
+    ./build.sh --config Release --parallel 0 --enable_pybind --build_wheel --enable_training --enable_training_apis --enable_training_ops --skip_tests --allow_running_as_root && \
     pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \
     cd .. && /bin/rm -rf ./onnxruntime

View File

@@ -27,21 +27,25 @@ fi
 MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
 DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
-OPENBLAS_VERSION=${OPENBLAS_VERSION:-}
 
 case ${image} in
     manylinux2_28-builder:cpu)
         TARGET=cpu_final
         GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
         MANY_LINUX_VERSION="2_28"
         ;;
+    manylinuxaarch64-builder:cpu-aarch64)
+        TARGET=final
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
+        MANY_LINUX_VERSION="aarch64"
+        ;;
     manylinux2_28_aarch64-builder:cpu-aarch64)
        TARGET=final
         GPU_IMAGE=arm64v8/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
         MANY_LINUX_VERSION="2_28_aarch64"
-        OPENBLAS_VERSION="v0.3.29"
         ;;
     manylinuxcxx11-abi-builder:cpu-cxx11-abi)
         TARGET=final
@@ -55,27 +59,23 @@ case ${image} in
         DOCKER_GPU_BUILD_ARG=""
         MANY_LINUX_VERSION="s390x"
         ;;
-    manylinux2_28-builder:cuda11*)
+    manylinux2_28-builder:cuda*)
         TARGET=cuda_final
         GPU_IMAGE=amd64/almalinux:8
         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
         MANY_LINUX_VERSION="2_28"
         ;;
-    manylinux2_28-builder:cuda12*)
-        TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
-        MANY_LINUX_VERSION="2_28"
-        ;;
     manylinuxaarch64-builder:cuda*)
         TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
         MANY_LINUX_VERSION="aarch64"
         DOCKERFILE_SUFFIX="_cuda_aarch64"
         ;;
     manylinux2_28-builder:rocm*)
         TARGET=rocm_final
+        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
+        DEVTOOLSET_VERSION="9"
         MANY_LINUX_VERSION="2_28"
-        DEVTOOLSET_VERSION="11"
-        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
@@ -111,7 +111,6 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
 DOCKER_BUILDKIT=1 docker build \
     ${DOCKER_GPU_BUILD_ARG} \
     --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
     --target "${TARGET}" \
     -t "${tmp_tag}" \
     $@ \
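For orientation, the case arms above key off an image name of the form repo:tag, with GPU_ARCH_VERSION taken from the environment for the cuda*/rocm* arms. A hypothetical invocation (the script's actual name and argument handling are outside this excerpt):

    # Hypothetical: build the CUDA manylinux builder image
    GPU_ARCH_VERSION=12.6 ./build.sh manylinux2_28-builder:cuda12.6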

View File

@@ -97,7 +97,7 @@ find /opt/_internal -type f -print0 \
     | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
 # We do not need the Python test suites, or indeed the precompiled .pyc and
 # .pyo files. Partially cribbed from:
-# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile # @lint-ignore
+# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
 find /opt/_internal \
     \( -type d -a -name test -o -name tests \) \
     -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \

View File

@@ -2,7 +2,7 @@
 # Helper utilities for build
 # Script used only in CD pipeline
 
-OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ # @lint-ignore
+OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
 CURL_DOWNLOAD_URL=https://curl.se/download
 AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf

View File

@@ -41,11 +41,14 @@ fbscribelogger==0.1.7
 #Pinned versions: 0.1.6
 #test that import:
 
-flatbuffers==24.12.23
+flatbuffers==2.0 ; platform_machine != "s390x"
 #Description: cross platform serialization library
-#Pinned versions: 24.12.23
+#Pinned versions: 2.0
 #test that import:
 
+flatbuffers ; platform_machine == "s390x"
+#Description: cross platform serialization library; Newer version is required on s390x for new python version
+
 hypothesis==5.35.1
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
@@ -90,7 +93,7 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:
 
-mypy==1.15.0
+mypy==1.14.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
 #Pinned versions: 1.14.0
@@ -163,10 +166,10 @@ pillow==11.0.0
 #Pinned versions: 10.3.0
 #test that import:
 
-protobuf==5.29.4
-#Description: Google's data interchange format
-#Pinned versions: 5.29.4
-#test that import: test_tensorboard.py, test/onnx/*
+protobuf==3.20.2
+#Description: Google's data interchange format
+#Pinned versions: 3.20.1
+#test that import: test_tensorboard.py
 
 psutil
 #Description: information on running processes and system utilization
@@ -334,12 +337,12 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:
 
-onnx==1.18.0
-#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
+onnx==1.17.0
+#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
 
-onnxscript==0.2.6
+onnxscript==0.2.2
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@@ -376,6 +379,3 @@ dataclasses_json==0.6.7
 #Description: required for data pipeline and scripts under tools/stats
 #Pinned versions: 0.6.7
 #test that import:
-
-cmake==4.0.0
-#Description: required for building

View File

@@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@a98ffecb792d50df495be401becbf5c414421423#egg=pytorch_sphinx_theme2
 
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
@@ -15,10 +15,6 @@ sphinxext-opengraph==0.9.1
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.9.1
 
-sphinx_sitemap==2.6.0
-#Description: This is used to generate sitemap for PyTorch docs
-#Pinned versions: 2.6.0
-
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3

View File

@@ -1 +1 @@
-3.3.1
+3.3.0

View File

@@ -26,6 +26,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
 ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
@@ -42,6 +43,13 @@ ARG CLANG_VERSION
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh
 
+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -75,6 +83,12 @@ COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
 
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 ARG TRITON
 
 FROM base as triton-builder

View File

@@ -27,6 +27,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
 ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
@@ -42,6 +43,13 @@ ARG CLANG_VERSION
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh
 
+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -100,6 +108,12 @@ COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
 
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh

View File

@@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
 ARG DOCS
 ARG BUILD_ENVIRONMENT
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
@@ -83,6 +84,12 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}
 
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh

View File

@@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
 ARG DOCS
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
@@ -53,8 +54,7 @@ ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
 # No effect if cuda not installed
@@ -74,6 +74,13 @@ ADD ./common/install_ucc.sh install_ucc.sh
 RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
 RUN rm install_ucc.sh
 
+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -81,6 +88,12 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}
 
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh

View File

@@ -12,12 +12,13 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
	-w /builder \
	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \
	-e DESIRED_ROCM=${DESIRED_ROCM} \
-	"pytorch/almalinux-builder:rocm${DESIRED_ROCM}" \
+	"pytorch/manylinux2_28-builder:rocm${DESIRED_ROCM}-main" \
	magma-rocm/build_magma.sh

 .PHONY: all
 all: magma-rocm64
 all: magma-rocm63
+all: magma-rocm624

 .PHONY:
 clean:
@@ -33,3 +34,8 @@ magma-rocm64:
 magma-rocm63: DESIRED_ROCM := 6.3
 magma-rocm63:
	$(DOCKER_RUN)
+
+.PHONY: magma-rocm624
+magma-rocm624: DESIRED_ROCM := 6.2.4
+magma-rocm624:
+	$(DOCKER_RUN)
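Each target just re-parameterizes the same containerized build; e.g., building the ROCm 6.2.4 magma package locally (assuming docker and the builder image tag are available):

    make magma-rocm624    # runs magma-rocm/build_magma.sh with DESIRED_ROCM=6.2.4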

View File

@@ -12,12 +12,13 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
	-e DESIRED_CUDA=${DESIRED_CUDA} \
	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
-	"pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \
+	"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
	magma/build_magma.sh

 .PHONY: all
 all: magma-cuda128
 all: magma-cuda126
+all: magma-cuda124
 all: magma-cuda118

 .PHONY:
@@ -36,6 +37,11 @@ magma-cuda126: DESIRED_CUDA := 12.6
 magma-cuda126:
	$(DOCKER_RUN)
+
+.PHONY: magma-cuda124
+magma-cuda124: DESIRED_CUDA := 12.4
+magma-cuda124:
+	$(DOCKER_RUN)

 .PHONY: magma-cuda118
 magma-cuda118: DESIRED_CUDA := 11.8
 magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
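The CUDA Makefile follows the same pattern, with CUDA_ARCH_LIST threaded through to the containerized build; e.g.:

    make magma-cuda124    # runs magma/build_magma.sh with DESIRED_CUDA=12.4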

View File

@@ -18,10 +18,12 @@ retry () {
     $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
 }
 
-PLATFORM=""
+PLATFORM="manylinux2014_x86_64"
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
     retry yum install -q -y zip openssl
     PLATFORM="manylinux_2_28_x86_64"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
@@ -34,9 +36,6 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
     retry apt-get update
     retry apt-get -y install zip openssl
-else
-    echo "Unknown OS: '$OS_NAME'"
-    exit 1
 fi
 
 # We use the package name to test the package by passing this to 'pip install'
@@ -80,6 +79,8 @@ if [[ -e /opt/openssl ]]; then
     export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
 fi
 
 mkdir -p /tmp/$WHEELHOUSE_DIR
 
 export PATCHELF_BIN=/usr/local/bin/patchelf
@@ -320,8 +321,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
         # ROCm workaround for roctracer dlopens
         if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
             patchedpath=$(fname_without_so_number $destpath)
-        # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
-        elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
+        # Keep the so number for XPU dependencies
+        elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
             patchedpath=$destpath
         else
             patchedpath=$(fname_with_sha256 $destpath)
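Two helpers used above are defined earlier in this script, outside this excerpt. The retry helper (whose body appears in the first hunk of this file) reruns its whole argument list with 1/2/4/8-second backoff, e.g.:

    retry curl -fsSL -O https://example.com/artifact.tar.gz   # URL illustrative

fname_with_sha256 renames a shared object to embed a content hash; a hypothetical sketch of its usual shape, for orientation only:

    # Hypothetical sketch: libfoo.so.1 -> libfoo-<hash8>.so.1
    fname_with_sha256() {
        HASH=$(sha256sum "$1" | cut -c1-8)
        BASENAME=$(basename "$1")
        INITNAME=$(echo "$BASENAME" | cut -f1 -d".")
        ENDNAME=$(echo "$BASENAME" | cut -f2- -d".")
        echo "$(dirname "$1")/$INITNAME-$HASH.$ENDNAME"
    }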

View File

@@ -15,9 +15,6 @@ export INSTALL_TEST=0 # don't install test binaries into site-packages
 export USE_CUPTI_SO=0
 export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build
 export USE_CUFILE=${USE_CUFILE:-1}
-export USE_SYSTEM_NCCL=1
-export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
-export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
 
 # Keep an array of cmake variables to add to
 if [[ -z "$CMAKE_ARGS" ]]; then
@@ -39,8 +36,10 @@ if [[ -n "$DESIRED_CUDA" ]]; then
     if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then
         CUDA_VERSION=${DESIRED_CUDA}
     else
-        # cu126, cu128 etc...
-        if [[ ${#DESIRED_CUDA} -eq 5 ]]; then
+        # cu90, cu92, cu100, cu101
+        if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
+            CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
+        elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
             CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
         fi
     fi
@@ -62,6 +61,10 @@ case ${CUDA_VERSION} in
         TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
         ;;
+    12.4)
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
     11.8)
         TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
@@ -88,15 +91,14 @@ fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
     LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
     LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
     LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
-else
-    echo "Unknown OS: '$OS_NAME'"
-    exit 1
 fi
 
 DEPS_LIST=(
@@ -106,8 +108,26 @@ DEPS_SONAME=(
     "libgomp.so.1"
 )
 
+# CUDA 11.8 has to ship libcusparseLt.so.0 with the binary
+# since nvidia-cusparselt-cu11 is not available in PYPI
+if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then
+    DEPS_SONAME+=(
+        "libcusparseLt.so.0"
+    )
+    DEPS_LIST+=(
+        "/usr/local/cuda/lib64/libcusparseLt.so.0"
+    )
+fi
+
-# CUDA_VERSION 12.6, 12.8
+# Turn USE_CUFILE off for CUDA 11.8, 12.4 since nvidia-cufile-cu11 and 1.9.0.20 are
+# not available in PYPI
+if [[ $CUDA_VERSION == "11.8" || $CUDA_VERSION == "12.4" ]]; then
+    export USE_CUFILE=0
+fi
+
+# CUDA_VERSION 12.4, 12.6, 12.8
 if [[ $CUDA_VERSION == 12* ]]; then
     export USE_STATIC_CUDNN=0
     # Try parallelizing nvcc as well
@@ -131,8 +151,6 @@ if [[ $CUDA_VERSION == 12* ]]; then
         "/usr/local/cuda/lib64/libnvToolsExt.so.1"
         "/usr/local/cuda/lib64/libnvrtc.so.12"
         "/usr/local/cuda/lib64/libnvrtc-builtins.so"
-        "/usr/local/cuda/lib64/libcufile.so.0"
-        "/usr/local/cuda/lib64/libcufile_rdma.so.1"
     )
     DEPS_SONAME+=(
         "libcudnn_adv.so.9"
@@ -150,9 +168,17 @@
         "libnvToolsExt.so.1"
         "libnvrtc.so.12"
         "libnvrtc-builtins.so"
-        "libcufile.so.0"
-        "libcufile_rdma.so.1"
     )
+    if [[ $USE_CUFILE == 1 ]]; then
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcufile.so.0"
+            "/usr/local/cuda/lib64/libcufile_rdma.so.1"
+        )
+        DEPS_SONAME+=(
+            "libcufile.so.0"
+            "libcufile_rdma.so.1"
+        )
+    fi
 else
     echo "Using nvidia libs from pypi."
     CUDA_RPATHS=(
@@ -168,38 +194,31 @@ if [[ $CUDA_VERSION == 12* ]]; then
         '$ORIGIN/../../cusparselt/lib'
         '$ORIGIN/../../nvidia/nccl/lib'
         '$ORIGIN/../../nvidia/nvtx/lib'
-        '$ORIGIN/../../nvidia/cufile/lib'
     )
+    if [[ $USE_CUFILE == 1 ]]; then
+        CUDA_RPATHS+=(
+            '$ORIGIN/../../nvidia/cufile/lib'
+        )
+    fi
     CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
     export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
     export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
     export FORCE_RPATH="--force-rpath"
     export USE_STATIC_NCCL=0
-    export USE_SYSTEM_NCCL=1
     export ATEN_STATIC_CUDA=0
     export USE_CUDA_STATIC_LINK=0
     export USE_CUPTI_SO=1
+    export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
+    export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
 fi
elif [[ $CUDA_VERSION == "11.8" ]]; then elif [[ $CUDA_VERSION == "11.8" ]]; then
export USE_STATIC_CUDNN=0 export USE_STATIC_CUDNN=0
# Turn USE_CUFILE off for CUDA 11.8 since nvidia-cufile-cu11 and 1.9.0.20 are
# not available in PYPI
export USE_CUFILE=0
# Try parallelizing nvcc as well # Try parallelizing nvcc as well
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
# Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750 # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750
export BUILD_BUNDLE_PTXAS=1 export BUILD_BUNDLE_PTXAS=1
# CUDA 11.8 have to ship the libcusparseLt.so.0 with the binary
# since nvidia-cusparselt-cu11 is not available in PYPI
if [[ $USE_CUSPARSELT == "1" ]]; then
DEPS_SONAME+=(
"libcusparseLt.so.0"
)
DEPS_LIST+=(
"/usr/local/cuda/lib64/libcusparseLt.so.0"
)
fi
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
echo "Bundling with cudnn and cublas." echo "Bundling with cudnn and cublas."
DEPS_LIST+=( DEPS_LIST+=(
@ -254,9 +273,12 @@ elif [[ $CUDA_VERSION == "11.8" ]]; then
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
export FORCE_RPATH="--force-rpath" export FORCE_RPATH="--force-rpath"
export USE_STATIC_NCCL=0 export USE_STATIC_NCCL=0
export USE_SYSTEM_NCCL=1
export ATEN_STATIC_CUDA=0 export ATEN_STATIC_CUDA=0
export USE_CUDA_STATIC_LINK=0 export USE_CUDA_STATIC_LINK=0
export USE_CUPTI_SO=1 export USE_CUPTI_SO=1
export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
fi fi
else else
echo "Unknown cuda version $CUDA_VERSION" echo "Unknown cuda version $CUDA_VERSION"

View File

@@ -22,7 +22,9 @@ retry () {
 # TODO move this into the Docker images
 OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
-if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
     retry yum install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
     retry dnf install -q -y zip openssl
@@ -33,9 +35,6 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
     sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
     retry apt-get update
     retry apt-get -y install zip openssl
-else
-    echo "Unknown OS: '$OS_NAME'"
-    exit 1
 fi
 # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if

View File

@@ -20,11 +20,7 @@ fi
 source /opt/intel/oneapi/compiler/latest/env/vars.sh
 source /opt/intel/oneapi/pti/latest/env/vars.sh
 source /opt/intel/oneapi/umf/latest/env/vars.sh
-source /opt/intel/oneapi/ccl/latest/env/vars.sh
-source /opt/intel/oneapi/mpi/latest/env/vars.sh
 export USE_STATIC_MKL=1
-export USE_ONEMKL=1
-export USE_XCCL=1
 WHEELHOUSE_DIR="wheelhousexpu"
 LIBTORCH_HOUSE_DIR="libtorch_housexpu"

View File

@@ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
 built on Jenkins and are used in triggered builds already have this
 environment variable set in their manifest. Also see
 `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
+
+Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.

View File

@@ -171,12 +171,6 @@ fi
 if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
     # shellcheck disable=SC1091
     source /opt/intel/oneapi/compiler/latest/env/vars.sh
-    # shellcheck disable=SC1091
-    source /opt/intel/oneapi/ccl/latest/env/vars.sh
-    # shellcheck disable=SC1091
-    source /opt/intel/oneapi/mpi/latest/env/vars.sh
-    # Enable XCCL build
-    export USE_XCCL=1
     # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
     export USE_KINETO=0
     export TORCH_XPU_ARCH_LIST=pvc

View File

@@ -302,22 +302,19 @@ except RuntimeError as e:
 fi
 ###############################################################################
-# Check for C++ ABI compatibility to GCC-11 - GCC 13
+# Check for C++ ABI compatibility to GCC-11
 ###############################################################################
 if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
     pushd /tmp
-    # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
-    # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
-    # gcc 11 - CUDA 11.8, xpu, rocm
-    # gcc 13 - CUDA 12.6, 12.8 and cpu
-    # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
-    if [[ "$(uname -m)" == "s390x" ]]; then
-        cxx_abi="19"
-    elif [[ "$DESIRED_CUDA" != 'cu118' && "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
-        cxx_abi="18"
-    else
-        cxx_abi="16"
+    # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html gcc-11 is ABI16
+    # Though manylinux_2.28 should have been built with gcc-14, per
+    # https://github.com/pypa/manylinux?tab=readme-ov-file#manylinux_2_28-almalinux-8-based
+    # On s390x gcc 14 is used because it contains fix for interaction
+    # between precompiled headers and vectorization builtins.
+    # This fix is not available in earlier gcc versions.
+    # gcc-14 uses ABI19.
+    if [[ "$(uname -m)" != "s390x" ]]; then
+        python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1016' else 1)"
     fi
-    python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
     popd
 fi
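The gate reduces to comparing the pybind11 ABI tag recorded in the wheel against the tag implied by the toolchain that should have produced it. A standalone sketch, assuming the build exposes torch._C._PYBIND11_BUILD_ABI as the script above does, with the gcc-to-ABI mapping quoted in the comments (gcc-11 -> 1016, gcc-13 -> 1018, gcc-14 -> 1019):

import torch

expected = "_cxxabi1018"  # assumed example: a CUDA 12.x manywheel built with gcc-13
actual = torch._C._PYBIND11_BUILD_ABI
if actual != expected:
    raise SystemExit(f"ABI mismatch: wheel reports {actual}, expected {expected}")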

View File

@@ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
   # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
   unset HIP_PLATFORM
   export PYTORCH_TEST_WITH_ROCM=1
+  # temporary to locate some kernel issues on the CI nodes
+  export HSAKMT_DEBUG_LEVEL=4
+  # improve rccl performance for distributed tests
+  export HSA_FORCE_FINE_GRAIN_PCIE=1
 fi
 # TODO: Re-enable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598

View File

@@ -1,50 +1,31 @@
 #!/bin/bash
 # Script for installing sccache on the xla build job, which uses xla's docker
-# image, which has sccache installed but doesn't write the stubs. This is
-# mostly copied from .ci/docker/install_cache.sh. Changes are: removing checks
-# that will always return the same thing, ex checks for rocm, CUDA, changing
-# the path where sccache is installed, not changing /etc/environment, and not
-# installing/downloading sccache as it is already in the docker image.
+# image and doesn't have sccache installed on it. This is mostly copied from
+# .ci/docker/install_cache.sh. Changes are: removing checks that will always
+# return the same thing, ex checks for rocm, CUDA, and changing the path
+# where sccache is installed, and not changing /etc/environment.
 set -ex -o pipefail
+install_binary() {
+  echo "Downloading sccache binary from S3 repo"
+  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
+}
 mkdir -p /tmp/cache/bin
+mkdir -p /tmp/cache/lib
 export PATH="/tmp/cache/bin:$PATH"
+install_binary
+chmod a+x /tmp/cache/bin/sccache
 function write_sccache_stub() {
   # Unset LD_PRELOAD for ps because of asan + ps issues
   # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  if [ "$1" == "gcc" ]; then
-    # Do not call sccache recursively when dumping preprocessor argument
-    # For some reason it's very important for the first cached nvcc invocation
-    cat >"/tmp/cache/bin/$1" <<EOF
-#!/bin/sh
-# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
-for arg in "\$@"; do
-  if [ "\$arg" = "-E" ]; then
-    exec $(which "$1") "\$@"
-  fi
-done
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which "$1") "\$@"
-else
-  exec $(which "$1") "\$@"
-fi
-EOF
-  else
-    cat >"/tmp/cache/bin/$1" <<EOF
-#!/bin/sh
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which "$1") "\$@"
-else
-  exec $(which "$1") "\$@"
-fi
-EOF
-  fi
+  # shellcheck disable=SC2086
+  # shellcheck disable=SC2059
+  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
   chmod a+x "/tmp/cache/bin/$1"
 }
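Both variants of the stub rely on the same guard: wrap the compiler with sccache unless the parent process is already sccache, which would otherwise recurse. Roughly the same logic in Python, as a sketch (Linux-only /proc lookup; the compiler path and function names are placeholders):

import os
import subprocess
import sys

def parent_comm() -> str:
    # Same information the stub reads via `ps -p $PPID -o comm=`.
    with open(f"/proc/{os.getppid()}/comm") as f:
        return f.read().strip()

def run_wrapped(real_compiler: str, args: list[str]) -> int:
    if parent_comm() == "sccache":
        return subprocess.call([real_compiler, *args])  # already under sccache
    return subprocess.call(["sccache", real_compiler, *args])

if __name__ == "__main__":
    sys.exit(run_wrapped("/usr/bin/gcc", sys.argv[1:]))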

View File

@@ -40,7 +40,7 @@ if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
 else
   # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
   # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
+  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
 fi
 if which sccache > /dev/null; then
   print_sccache_stats

View File

@@ -20,4 +20,14 @@ print_cmake_info() {
   CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC")
   # Print all libraries under cmake rpath for debugging
   ls -la "$CONDA_INSTALLATION_DIR/../lib"
+  export CMAKE_EXEC
+  # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
+  # where cmake dependencies couldn't be found. This seems to point to how conda
+  # links $CMAKE_EXEC to its package cache when cloning a new environment
+  install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
+  # Adding the rpath will invalidate cmake signature, so signing it again here
+  # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
+  # with an exit code 137 otherwise
+  codesign -f -s - "${CMAKE_EXEC}" || true
 }

View File

@@ -42,16 +42,6 @@ test_python_all() {
   assert_git_not_dirty
 }
-test_python_mps() {
-  setup_test_python
-  time python test/run_test.py --verbose --mps
-  MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture
-  assert_git_not_dirty
-}
 test_python_shard() {
   if [[ -z "$NUM_TEST_SHARDS" ]]; then
     echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@@ -165,7 +155,6 @@ test_jit_hooks() {
 torchbench_setup_macos() {
   git clone --recursive https://github.com/pytorch/vision torchvision
   git clone --recursive https://github.com/pytorch/audio torchaudio
-  brew install jpeg-turbo libpng
   pushd torchvision
   git fetch
@@ -180,8 +169,7 @@ torchbench_setup_macos() {
   git checkout "$(cat ../.github/ci_commit_pins/audio.txt)"
   git submodule update --init --recursive
   python setup.py clean
-  #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp
-  USE_OPENMP=0 python setup.py develop
+  python setup.py develop
   popd
 # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
@@ -189,8 +177,9 @@ torchbench_setup_macos() {
   checkout_install_torchbench
 }
-pip_benchmark_deps() {
-  python -mpip install --no-input astunparse requests cython scikit-learn
+conda_benchmark_deps() {
+  conda install -y astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions requests protobuf numba cython scikit-learn
+  conda install -y -c conda-forge librosa
 }
@@ -198,7 +187,7 @@ test_torchbench_perf() {
   print_cmake_info
   echo "Launching torchbench setup"
-  pip_benchmark_deps
+  conda_benchmark_deps
   torchbench_setup_macos
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
@@ -225,7 +214,7 @@ test_torchbench_smoketest() {
   print_cmake_info
   echo "Launching torchbench setup"
-  pip_benchmark_deps
+  conda_benchmark_deps
   # shellcheck disable=SC2119,SC2120
   torchbench_setup_macos
@@ -233,8 +222,7 @@ test_torchbench_smoketest() {
   mkdir -p "$TEST_REPORTS_DIR"
   local device=mps
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor)
-  local hf_models=(GoogleFnet YituTechConvBert Speech2Text2ForCausalLM)
+  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder moco speech_transformer)
   for backend in eager inductor; do
@@ -249,21 +237,6 @@ test_torchbench_smoketest() {
       PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
         --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
         --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
-      if [ "$backend" == "inductor" ]; then
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-          --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
-      fi
-    done
-    for model in "${hf_models[@]}"; do
-      if [ "$backend" == "inductor" ]; then
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
-          --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_performance.csv" || true
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
-          --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
-      fi
     done
   done
@@ -290,7 +263,7 @@ test_hf_perf() {
   print_cmake_info
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
-  pip_benchmark_deps
+  conda_benchmark_deps
   torchbench_setup_macos
   echo "Launching HuggingFace training perf run"
@@ -306,7 +279,7 @@ test_timm_perf() {
   print_cmake_info
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
-  pip_benchmark_deps
+  conda_benchmark_deps
   torchbench_setup_macos
   echo "Launching timm training perf run"
@@ -332,8 +305,6 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
   test_timm_perf
 elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
   test_torchbench_smoketest
-elif [[ $TEST_CONFIG == *"mps"* ]]; then
-  test_python_mps
 elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
   test_python_shard "${SHARD_NUMBER}"
   if [[ "${SHARD_NUMBER}" == 1 ]]; then

View File

@@ -0,0 +1,22 @@
#!/bin/bash
set -e
run_test () {
rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/
"$@"
cd .. && rm -rf test_tmp/
}
get_runtime_of_command () {
TIMEFORMAT=%R
# runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null)
runtime=$( { time "$@"; } 2>&1 1>/dev/null)
if [[ $runtime == *"Error"* ]]; then
exit 1
fi
runtime=${runtime#+++ $@}
runtime=$(python -c "print($runtime)")
echo "$runtime"
}
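get_runtime_of_command records the real (wall-clock) time of one run via TIMEFORMAT=%R and aborts if the command printed an error. A rough Python equivalent of the measurement, for reference only:

import subprocess
import time

def get_runtime_of_command(*cmd: str) -> float:
    # Wall-clock seconds for one run; fails fast like the shell helper.
    start = time.monotonic()
    subprocess.run(cmd, check=True, capture_output=True)
    return time.monotonic() - start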

View File

@@ -0,0 +1,91 @@
import argparse
import json
import math
import sys
parser = argparse.ArgumentParser()
parser.add_argument(
"--test-name", dest="test_name", action="store", required=True, help="test name"
)
parser.add_argument(
"--sample-stats",
dest="sample_stats",
action="store",
required=True,
help="stats from sample",
)
parser.add_argument(
"--update",
action="store_true",
help="whether to update baseline using stats from sample",
)
args = parser.parse_args()
test_name = args.test_name
if "cpu" in test_name:
backend = "cpu"
elif "gpu" in test_name:
backend = "gpu"
data_file_path = f"../{backend}_runtime.json"
with open(data_file_path) as data_file:
data = json.load(data_file)
if test_name in data:
mean = float(data[test_name]["mean"])
sigma = float(data[test_name]["sigma"])
else:
# Let the test pass if baseline number doesn't exist
mean = sys.maxsize
sigma = 0.001
print("population mean: ", mean)
print("population sigma: ", sigma)
# Let the test pass if baseline number is NaN (which happened in
# the past when we didn't have logic for catching NaN numbers)
if math.isnan(mean) or math.isnan(sigma):
mean = sys.maxsize
sigma = 0.001
sample_stats_data = json.loads(args.sample_stats)
sample_mean = float(sample_stats_data["mean"])
sample_sigma = float(sample_stats_data["sigma"])
print("sample mean: ", sample_mean)
print("sample sigma: ", sample_sigma)
if math.isnan(sample_mean):
raise Exception("""Error: sample mean is NaN""") # noqa: TRY002
elif math.isnan(sample_sigma):
raise Exception("""Error: sample sigma is NaN""") # noqa: TRY002
z_value = (sample_mean - mean) / sigma
print("z-value: ", z_value)
if z_value >= 3:
raise Exception( # noqa: TRY002
f"""\n
z-value >= 3, there is high chance of perf regression.\n
To reproduce this regression, run
`cd .ci/pytorch/perf_test/ && bash {test_name}.sh` on your local machine
and compare the runtime before/after your code change.
"""
)
else:
print("z-value < 3, no perf regression detected.")
if args.update:
print("We will use these numbers as new baseline.")
new_data_file_path = f"../new_{backend}_runtime.json"
with open(new_data_file_path) as new_data_file:
new_data = json.load(new_data_file)
new_data[test_name] = {}
new_data[test_name]["mean"] = sample_mean
new_data[test_name]["sigma"] = max(sample_sigma, sample_mean * 0.1)
with open(new_data_file_path, "w") as new_data_file:
json.dump(new_data, new_data_file, indent=4)
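The gate is a one-sided z-test: a regression is flagged when the sample mean sits at least three baseline sigmas above the baseline mean. A worked example with made-up numbers:

mean, sigma = 10.0, 0.2  # baseline runtime stats, in seconds
sample_mean = 10.5       # freshly measured mean
z_value = (sample_mean - mean) / sigma
print(z_value)           # 2.5 -> under the 3.0 threshold, so no regression is flagged

Note also that the update path floors the stored sigma at 10% of the mean via max(sample_sigma, sample_mean * 0.1), so one unusually consistent run cannot make every later z-value explode.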

View File

@@ -0,0 +1,18 @@
import json
import sys
import numpy
sample_data_list = sys.argv[1:]
sample_data_list = [float(v.strip()) for v in sample_data_list]
sample_mean = numpy.mean(sample_data_list)
sample_sigma = numpy.std(sample_data_list)
data = {
"mean": sample_mean,
"sigma": sample_sigma,
}
print(json.dumps(data))
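numpy.std here is the population standard deviation (ddof=0), i.e. the sigma is taken over exactly the collected runs. With five hypothetical samples:

import numpy

samples = [1.20, 1.22, 1.19, 1.25, 1.21]
print(numpy.mean(samples))  # 1.214
print(numpy.std(samples))   # ~0.0206 (population sigma)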

View File

@@ -0,0 +1,43 @@
#!/bin/bash
set -e
. ./common.sh
test_cpu_speed_mini_sequence_labeler () {
echo "Testing: mini sequence labeler, CPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/pytorch/benchmark.git
cd benchmark/
git checkout 726567a455edbfda6199445922a8cfee82535664
cd scripts/mini_sequence_labeler
SAMPLE_ARRAY=()
NUM_RUNS=$1
for (( i=1; i<=NUM_RUNS; i++ )) do
runtime=$(get_runtime_of_command python main.py)
SAMPLE_ARRAY+=("${runtime}")
done
cd ../../..
stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
echo "Runtime stats in seconds:"
echo "$stats"
if [ "$2" == "compare_with_baseline" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
elif [ "$2" == "compare_and_update" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_cpu_speed_mini_sequence_labeler "$@"
fi
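Each test_cpu_speed_* and test_gpu_speed_* script in this set repeats the same shape: clone a pinned benchmark, time NUM_RUNS runs, summarize, then hand the summary to compare_with_baseline.py. The summarizing step is equivalent to this sketch (statistics.pstdev matches numpy.std's population sigma):

import json
import statistics

def summarize(samples: list[float]) -> str:
    # Same JSON shape that get_stats.py prints.
    return json.dumps({
        "mean": statistics.mean(samples),
        "sigma": statistics.pstdev(samples),  # population sigma, ddof=0
    })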

View File

@@ -0,0 +1,45 @@
#!/bin/bash
set -e
. ./common.sh
test_cpu_speed_mnist () {
echo "Testing: MNIST, CPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/pytorch/examples.git -b perftests
cd examples/mnist
conda install -c pytorch torchvision-cpu
# Download data
python main.py --epochs 0
SAMPLE_ARRAY=()
NUM_RUNS=$1
for (( i=1; i<=NUM_RUNS; i++ )) do
runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
echo "$runtime"
SAMPLE_ARRAY+=("${runtime}")
done
cd ../..
stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
echo "Runtime stats in seconds:"
echo "$stats"
if [ "$2" == "compare_with_baseline" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
elif [ "$2" == "compare_and_update" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_cpu_speed_mnist "$@"
fi

View File

@@ -0,0 +1,29 @@
#!/bin/bash
. ./common.sh
test_cpu_speed_torch () {
echo "Testing: torch.*, CPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/yf225/perf-tests.git
if [ "$1" == "compare_with_baseline" ]; then
export ARGS=(--compare ../cpu_runtime.json)
elif [ "$1" == "compare_and_update" ]; then
export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
elif [ "$1" == "update_only" ]; then
export ARGS=(--update ../new_cpu_runtime.json)
fi
if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then
echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
exit 1
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_cpu_speed_torch "$@"
fi

View File

@@ -0,0 +1,29 @@
#!/bin/bash
. ./common.sh
test_cpu_speed_torch_tensor () {
echo "Testing: torch.Tensor.*, CPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/yf225/perf-tests.git
if [ "$1" == "compare_with_baseline" ]; then
export ARGS=(--compare ../cpu_runtime.json)
elif [ "$1" == "compare_and_update" ]; then
export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
elif [ "$1" == "update_only" ]; then
export ARGS=(--update ../new_cpu_runtime.json)
fi
if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then
echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
exit 1
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_cpu_speed_torch_tensor "$@"
fi

View File

@@ -0,0 +1,44 @@
#!/bin/bash
set -e
. ./common.sh
test_gpu_speed_cudnn_lstm () {
echo "Testing: CuDNN LSTM, GPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/pytorch/benchmark.git
cd benchmark/
git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
cd scripts/
SAMPLE_ARRAY=()
NUM_RUNS=$1
for (( i=1; i<=NUM_RUNS; i++ )) do
runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check)
echo "$runtime"
SAMPLE_ARRAY+=("${runtime}")
done
cd ../..
stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
echo "Runtime stats in seconds:"
echo "$stats"
if [ "$2" == "compare_with_baseline" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
elif [ "$2" == "compare_and_update" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_gpu_speed_cudnn_lstm "$@"
fi

View File

@@ -0,0 +1,44 @@
#!/bin/bash
set -e
. ./common.sh
test_gpu_speed_lstm () {
echo "Testing: LSTM, GPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/pytorch/benchmark.git
cd benchmark/
git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
cd scripts/
SAMPLE_ARRAY=()
NUM_RUNS=$1
for (( i=1; i<=NUM_RUNS; i++ )) do
runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check)
echo "$runtime"
SAMPLE_ARRAY+=("${runtime}")
done
cd ../..
stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
echo "Runtime stats in seconds:"
echo "$stats"
if [ "$2" == "compare_with_baseline" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
elif [ "$2" == "compare_and_update" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_gpu_speed_lstm "$@"
fi

View File

@@ -0,0 +1,44 @@
#!/bin/bash
set -e
. ./common.sh
test_gpu_speed_mlstm () {
echo "Testing: MLSTM, GPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/pytorch/benchmark.git
cd benchmark/
git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
cd scripts/
SAMPLE_ARRAY=()
NUM_RUNS=$1
for (( i=1; i<=NUM_RUNS; i++ )) do
runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check)
echo "$runtime"
SAMPLE_ARRAY+=("${runtime}")
done
cd ../..
stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
echo "Runtime stats in seconds:"
echo "$stats"
if [ "$2" == "compare_with_baseline" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
elif [ "$2" == "compare_and_update" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_gpu_speed_mlstm "$@"
fi

View File

@@ -0,0 +1,48 @@
#!/bin/bash
set -e
. ./common.sh
test_gpu_speed_mnist () {
echo "Testing: MNIST, GPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/pytorch/examples.git -b perftests
cd examples/mnist
conda install -c pytorch torchvision
# Download data
python main.py --epochs 0
SAMPLE_ARRAY=()
NUM_RUNS=$1
# Needs warm up to get accurate number
python main.py --epochs 1 --no-log
for (( i=1; i<=NUM_RUNS; i++ )) do
runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
echo "$runtime"
SAMPLE_ARRAY+=("${runtime}")
done
cd ../..
stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
echo "Runtime stats in seconds:"
echo "$stats"
if [ "$2" == "compare_with_baseline" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
elif [ "$2" == "compare_and_update" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_gpu_speed_mnist "$@"
fi

View File

@@ -0,0 +1,53 @@
#!/bin/bash
set -e
. ./common.sh
test_gpu_speed_word_language_model () {
echo "Testing: word language model on Wikitext-2, GPU"
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
git clone https://github.com/pytorch/examples.git -b perftests
cd examples/word_language_model
cd data/wikitext-2
# Reduce dataset size, so that we can have more runs per test
sed -n '1,200p' test.txt > test_tmp.txt
sed -n '1,1000p' train.txt > train_tmp.txt
sed -n '1,200p' valid.txt > valid_tmp.txt
mv test_tmp.txt test.txt
mv train_tmp.txt train.txt
mv valid_tmp.txt valid.txt
cd ../..
SAMPLE_ARRAY=()
NUM_RUNS=$1
for (( i=1; i<=NUM_RUNS; i++ )) do
runtime=$(get_runtime_of_command python main.py --cuda --epochs 1)
echo "$runtime"
SAMPLE_ARRAY+=("${runtime}")
done
cd ../..
stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
echo "Runtime stats in seconds:"
echo "$stats"
if [ "$2" == "compare_with_baseline" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
elif [ "$2" == "compare_and_update" ]; then
python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
fi
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
run_test test_gpu_speed_word_language_model "$@"
fi

View File

@@ -0,0 +1,14 @@
import json
import sys
data_file_path = sys.argv[1]
commit_hash = sys.argv[2]
with open(data_file_path) as data_file:
data = json.load(data_file)
data["commit"] = commit_hash
with open(data_file_path, "w") as data_file:
json.dump(data, data_file)
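The helper stamps a baseline file with the commit it was measured at; the runner scripts below call it as `python update_commit_hash.py new_cpu_runtime.json "$DEFAULT_BRANCH_COMMIT_ID"`. A quick check of the effect (file name and sha are placeholders):

import json

with open("new_cpu_runtime.json") as f:
    assert json.load(f)["commit"] == "<sha>"  # the hash passed on the command line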

View File

@@ -76,7 +76,7 @@ fi
 # Environment initialization
 if [[ "$(uname)" == Darwin ]]; then
   # Install the testing dependencies
-  retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
+  retry conda install -yq future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
 else
   retry pip install -qr requirements.txt || true
   retry pip install -q hypothesis protobuf pytest setuptools || true
@@ -91,6 +91,7 @@ fi
 echo "Testing with:"
 pip freeze
+conda list || true
 ##############################################################################
 # Smoke tests

View File

@@ -0,0 +1,71 @@
#!/bin/bash
SCRIPT_PARENT_DIR=$(dirname "${BASH_SOURCE[0]}")
# shellcheck source=.ci/pytorch/common.sh
source "$SCRIPT_PARENT_DIR/common.sh"
cd .ci/pytorch/perf_test
echo "Running CPU perf test for PyTorch..."
pip install -q awscli
# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read
# More info at https://github.com/aws/aws-cli/issues/2321
aws configure set default.s3.multipart_threshold 5GB
UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
# Get current default branch commit hash
DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1)
export DEFAULT_BRANCH_COMMIT_ID
fi
# Find the default branch commit to test against
git remote add upstream https://github.com/pytorch/pytorch.git
git fetch upstream
IFS=$'\n'
while IFS='' read -r commit_id; do
if aws s3 ls s3://ossci-perf-test/pytorch/cpu_runtime/"${commit_id}".json; then
LATEST_TESTED_COMMIT=${commit_id}
break
fi
done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH")
aws s3 cp s3://ossci-perf-test/pytorch/cpu_runtime/"${LATEST_TESTED_COMMIT}".json cpu_runtime.json
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
# Prepare new baseline file
cp cpu_runtime.json new_cpu_runtime.json
python update_commit_hash.py new_cpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}"
fi
# Include tests
# shellcheck source=./perf_test/test_cpu_speed_mini_sequence_labeler.sh
. ./test_cpu_speed_mini_sequence_labeler.sh
# shellcheck source=./perf_test/test_cpu_speed_mnist.sh
. ./test_cpu_speed_mnist.sh
# shellcheck source=./perf_test/test_cpu_speed_torch.sh
. ./test_cpu_speed_torch.sh
# shellcheck source=./perf_test/test_cpu_speed_torch_tensor.sh
. ./test_cpu_speed_torch_tensor.sh
# Run tests
export TEST_MODE="compare_with_baseline"
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
export TEST_MODE="compare_and_update"
fi
# Operator tests
run_test test_cpu_speed_torch ${TEST_MODE}
run_test test_cpu_speed_torch_tensor ${TEST_MODE}
# Sample model tests
run_test test_cpu_speed_mini_sequence_labeler 20 ${TEST_MODE}
run_test test_cpu_speed_mnist 20 ${TEST_MODE}
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
# This could cause race condition if we are testing the same default branch commit twice,
# but the chance of them executing this line at the same time is low.
aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read
fi

View File

@@ -0,0 +1,76 @@
#!/bin/bash
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
pushd .ci/pytorch/perf_test
echo "Running GPU perf test for PyTorch..."
# Trying to uninstall PyYAML can cause problem. Workaround according to:
# https://github.com/pypa/pip/issues/5247#issuecomment-415571153
pip install -q awscli --ignore-installed PyYAML
# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read
# More info at https://github.com/aws/aws-cli/issues/2321
aws configure set default.s3.multipart_threshold 5GB
UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
# Get current default branch commit hash
DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1)
export DEFAULT_BRANCH_COMMIT_ID
fi
# Find the default branch commit to test against
git remote add upstream https://github.com/pytorch/pytorch.git
git fetch upstream
IFS=$'\n'
while IFS='' read -r commit_id; do
if aws s3 ls s3://ossci-perf-test/pytorch/gpu_runtime/"${commit_id}".json; then
LATEST_TESTED_COMMIT=${commit_id}
break
fi
done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH")
aws s3 cp s3://ossci-perf-test/pytorch/gpu_runtime/"${LATEST_TESTED_COMMIT}".json gpu_runtime.json
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
# Prepare new baseline file
cp gpu_runtime.json new_gpu_runtime.json
python update_commit_hash.py new_gpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}"
fi
# Include tests
# shellcheck source=./perf_test/test_gpu_speed_mnist.sh
. ./test_gpu_speed_mnist.sh
# shellcheck source=./perf_test/test_gpu_speed_word_language_model.sh
. ./test_gpu_speed_word_language_model.sh
# shellcheck source=./perf_test/test_gpu_speed_cudnn_lstm.sh
. ./test_gpu_speed_cudnn_lstm.sh
# shellcheck source=./perf_test/test_gpu_speed_lstm.sh
. ./test_gpu_speed_lstm.sh
# shellcheck source=./perf_test/test_gpu_speed_mlstm.sh
. ./test_gpu_speed_mlstm.sh
# Run tests
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
run_test test_gpu_speed_mnist 20 compare_and_update
run_test test_gpu_speed_word_language_model 20 compare_and_update
run_test test_gpu_speed_cudnn_lstm 20 compare_and_update
run_test test_gpu_speed_lstm 20 compare_and_update
run_test test_gpu_speed_mlstm 20 compare_and_update
else
run_test test_gpu_speed_mnist 20 compare_with_baseline
run_test test_gpu_speed_word_language_model 20 compare_with_baseline
run_test test_gpu_speed_cudnn_lstm 20 compare_with_baseline
run_test test_gpu_speed_lstm 20 compare_with_baseline
run_test test_gpu_speed_mlstm 20 compare_with_baseline
fi
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
# This could cause race condition if we are testing the same default branch commit twice,
# but the chance of them executing this line at the same time is low.
aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read
fi
popd

View File

@@ -1,74 +0,0 @@
import ctypes
import os
import sys
from pathlib import Path
def get_gomp_thread():
"""
Retrieves the maximum number of OpenMP threads after loading the `libgomp.so.1` library
and the `libtorch_cpu.so` library. It then queries the
maximum number of threads available for OpenMP parallel regions using the
`omp_get_max_threads` function.
Returns:
int: The maximum number of OpenMP threads available.
Notes:
- The function assumes the default path for `libgomp.so.1` on AlmaLinux OS.
- The path to `libtorch_cpu.so` is constructed based on the Python executable's
installation directory.
- This function is specific to environments where PyTorch and OpenMP are used
together and may require adjustments for other setups.
"""
python_path = Path(sys.executable).resolve()
python_prefix = (
python_path.parent.parent
) # Typically goes to the Python installation root
# Get the additional ABI flags (if any); it may be an empty string.
abiflags = getattr(sys, "abiflags", "")
# Construct the Python directory name correctly (e.g., "python3.13t").
python_version = (
f"python{sys.version_info.major}.{sys.version_info.minor}{abiflags}"
)
libtorch_cpu_path = (
python_prefix
/ "lib"
/ python_version
/ "site-packages"
/ "torch"
/ "lib"
/ "libtorch_cpu.so"
)
# use the default gomp path of AlmaLinux OS
libgomp_path = "/usr/lib64/libgomp.so.1"
os.environ["GOMP_CPU_AFFINITY"] = "0-3"
libgomp = ctypes.CDLL(libgomp_path)
libgomp = ctypes.CDLL(libtorch_cpu_path)
libgomp.omp_get_max_threads.restype = ctypes.c_int
libgomp.omp_get_max_threads.argtypes = []
omp_max_threads = libgomp.omp_get_max_threads()
return omp_max_threads
def main():
omp_max_threads = get_gomp_thread()
print(
f"omp_max_threads after loading libgomp.so and libtorch_cpu.so: {omp_max_threads}"
)
if omp_max_threads == 1:
raise RuntimeError(
"omp_max_threads is 1. Check whether libgomp.so is loaded twice."
)
if __name__ == "__main__":
main()
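The deleted check exists because loading a second copy of libgomp can leave OpenMP clamped to a single thread; the script loads libgomp and then libtorch_cpu and asserts omp_get_max_threads() stays above 1. The core probe, reduced to a few lines (AlmaLinux library path assumed, as in the script itself):

import ctypes

libgomp = ctypes.CDLL("/usr/lib64/libgomp.so.1")
libgomp.omp_get_max_threads.restype = ctypes.c_int
libgomp.omp_get_max_threads.argtypes = []
print(libgomp.omp_get_max_threads())  # 1 after loading torch would suggest a double-load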

View File

@@ -191,10 +191,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
   # shellcheck disable=SC1091
   source /opt/intel/oneapi/umf/latest/env/vars.sh
 fi
-# shellcheck disable=SC1091
-source /opt/intel/oneapi/ccl/latest/env/vars.sh
-# shellcheck disable=SC1091
-source /opt/intel/oneapi/mpi/latest/env/vars.sh
 # Check XPU status before testing
 xpu-smi discovery
 fi
@@ -318,18 +314,6 @@ test_python() {
   assert_git_not_dirty
 }
-test_python_smoke() {
-  # Smoke tests for H100
-  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  assert_git_not_dirty
-}
-test_h100_distributed() {
-  # Distributed tests at H100
-  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  assert_git_not_dirty
-}
 test_lazy_tensor_meta_reference_disabled() {
   export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
   echo "Testing lazy tensor operations without meta reference"
@@ -414,15 +398,8 @@ test_inductor_aoti() {
   # We need to hipify before building again
   python3 tools/amd_build/build_amd.py
 fi
-if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-  BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop
-  # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
-  LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH
-  CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
-else
-  BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-  CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
-fi
+BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
+CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
 }
 test_inductor_cpp_wrapper_shard() {
@@ -437,11 +414,10 @@ test_inductor_cpp_wrapper_shard() {
   if [[ "$1" -eq "2" ]]; then
     # For now, manually put the opinfo tests in shard 2, and all other tests in
-    # shard 1. Run all CPU tests, as well as specific GPU tests triggering past
-    # bugs, for now.
+    # shard 1. Test specific things triggering past bugs, for now.
     python test/run_test.py \
       --include inductor/test_torchinductor_opinfo \
-      -k 'linalg or to_sparse or TestInductorOpInfoCPU' \
+      -k 'linalg or to_sparse' \
       --verbose
     exit
   fi
@@ -601,6 +577,7 @@ test_perf_for_dashboard() {
   elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then
     device=cpu_aarch64
   fi
+  test_inductor_set_cpu_affinity
 elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
   device=cuda_a10g
 elif [[ "${TEST_CONFIG}" == *h100* ]]; then
@@ -609,9 +586,6 @@ test_perf_for_dashboard() {
   device=rocm
   fi
-  # Always set CPU affinity because metrics like compilation time requires CPU
-  test_inductor_set_cpu_affinity
   for mode in "${modes[@]}"; do
     if [[ "$mode" == "inference" ]]; then
       dtype=bfloat16
@@ -828,7 +802,16 @@ test_inductor_torchbench_smoketest_perf() {
   done
 }
+test_inductor_get_core_number() {
+  if [[ "${TEST_CONFIG}" == *aarch64* ]]; then
+    echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))"
+  else
+    echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))"
+  fi
+}
 test_inductor_set_cpu_affinity(){
+  #set jemalloc
   JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
   export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
   export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
@@ -840,23 +823,14 @@ test_inductor_set_cpu_affinity(){
   export KMP_AFFINITY=granularity=fine,compact,1,0
   export KMP_BLOCKTIME=1
   fi
-  # Use nproc here instead of lscpu because it takes into account cgroups slice
-  cpus=$(nproc)
-  thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}')
-  cores=$((cpus / thread_per_core))
-  # Set number of cores to 16 on aarch64 for performance runs
+  cores=$(test_inductor_get_core_number)
+  # Set number of cores to 16 on Aarch64 for performance runs.
   if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
     cores=16
   fi
   export OMP_NUM_THREADS=$cores
-  # Handle cgroups slice start and end CPU
-  start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))')
-  # Leaving one physical CPU for other tasks
-  end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core))
-  export TASKSET="taskset -c $start_cpu-$end_cpu"
+  end_core=$((cores-1))
+  export TASKSET="taskset -c 0-$end_core"
 }
 test_inductor_torchbench_cpu_smoketest_perf(){
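The "-" side above derives the taskset range from the job's actual CPU affinity mask instead of assuming CPUs 0..N-1, which matters when CI jobs run inside cgroups slices. The two endpoints come straight from the affinity mask, e.g.:

import os

cpus = sorted(os.sched_getaffinity(0))  # CPUs this process may actually run on
start_cpu, end_cpu = cpus[0], cpus[-1]
print(f"taskset -c {start_cpu}-{end_cpu}")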
@@ -1502,6 +1476,8 @@ test_executorch() {
   export PYTHON_EXECUTABLE=python
   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+  # For llama3
+  bash examples/models/llama3_2_vision/install_requirements.sh
   # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
   # from the PR
   bash .ci/scripts/setup-linux.sh --build-tool cmake
@@ -1544,7 +1520,7 @@ test_linux_aarch64() {
     inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
     inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
     inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
-    inductor/test_split_cat_fx_passes inductor/test_compile inductor/test_torchinductor \
+    inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
     inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
     inductor/test_triton_cpu_backend inductor/test_triton_extension_backend inductor/test_mkldnn_pattern_matcher inductor/test_cpu_cpp_wrapper \
     --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
@@ -1647,7 +1623,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
     install_torchaudio cuda
   fi
   install_torchvision
-  TORCH_CUDA_ARCH_LIST="8.0;8.6" install_torchao
+  TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
   id=$((SHARD_NUMBER-1))
   # https://github.com/opencv/opencv-python/issues/885
   pip_install opencv-python==4.8.0.74
@@ -1730,10 +1706,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
   test_python
   test_aten
   test_xpu_bin
-elif [[ "${TEST_CONFIG}" == smoke ]]; then
-  test_python_smoke
-elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
-  test_h100_distributed
 else
   install_torchvision
   install_monkeytype

View File

@@ -37,11 +37,6 @@ call %INSTALLER_DIR%\activate_miniconda3.bat
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail
-:: Update CMake
-call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9
-if errorlevel 1 goto fail
-if not errorlevel 0 goto fail
 call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail
@@ -93,7 +88,7 @@ set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
 :cuda_build_end
 set DISTUTILS_USE_SDK=1
-set PATH=%TMP_DIR_WIN%\bin;C:\Program Files\CMake\bin;%PATH%
+set PATH=%TMP_DIR_WIN%\bin;%PATH%
 :: The latest Windows CUDA test is running on AWS G5 runner with A10G GPU
 if "%TORCH_CUDA_ARCH_LIST%" == "" set TORCH_CUDA_ARCH_LIST=8.6

View File

@@ -24,7 +24,7 @@ if "%CUDA_SUFFIX%" == "" (
 if "%REBUILD%"=="" (
   if "%BUILD_ENVIRONMENT%"=="" (
-    curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z & REM @lint-ignore
+    curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z
   ) else (
     aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet
   )

View File

@@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi
 # TODO: Move both of them to Windows AMI
-python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 pytest-subtests==0.13.1
 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.12.2.0

View File

@@ -7,7 +7,7 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR%
 if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%
 :: activate visual studio
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe
 cd %DEPENDENCIES_DIR%

View File

@@ -7,7 +7,7 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR%
 if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%
 :: activate visual studio
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe
 :: Clone OpenBLAS

View File

@@ -2,7 +2,7 @@
 cd %PYTORCH_ROOT%
 :: activate visual studio
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe
 :: create virtual environment

View File

@@ -21,7 +21,7 @@ if %ENABLE_APL% == 1 (
 )
 :: activate visual studio
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe
 :: change to source directory

View File

@@ -21,7 +21,7 @@ if %ENABLE_APL% == 1 (
 )
 :: activate visual studio
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe
 :: change to source directory

View File

@@ -33,7 +33,7 @@ pushd tmp
 set VC_VERSION_LOWER=14
 set VC_VERSION_UPPER=36
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
 set install_root=%CD%
 set INCLUDE=%INCLUDE%;%install_root%\include;%install_root%\include\torch\csrc\api\include

View File

@ -1,7 +1,7 @@
@echo off @echo off
:: This script parses args, installs required libraries (MKL, Magma, libuv) :: This script parses args, installs required libraries (miniconda, MKL,
:: and then delegates to cpu.bat, cuda80.bat, etc. :: Magma), and then delegates to cpu.bat, cuda80.bat, etc.
if not "%CUDA_VERSION%" == "" if not "%PYTORCH_BUILD_VERSION%" == "" if not "%PYTORCH_BUILD_NUMBER%" == "" goto env_end if not "%CUDA_VERSION%" == "" if not "%PYTORCH_BUILD_VERSION%" == "" if not "%PYTORCH_BUILD_NUMBER%" == "" goto env_end
if "%~1"=="" goto arg_error if "%~1"=="" goto arg_error
@ -36,18 +36,28 @@ set DESIRED_PYTHON_PREFIX=py%DESIRED_PYTHON_PREFIX:;=;py%
set SRC_DIR=%~dp0 set SRC_DIR=%~dp0
pushd %SRC_DIR% pushd %SRC_DIR%
:: Install Miniconda3
set "CONDA_HOME=%CD%\conda"
set "tmp_conda=%CONDA_HOME%"
set "miniconda_exe=%CD%\miniconda.exe"
rmdir /s /q conda
del miniconda.exe
curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Windows-x86_64.exe -o "%miniconda_exe%"
start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda%
if ERRORLEVEL 1 exit /b 1
set "ORIG_PATH=%PATH%" set "ORIG_PATH=%PATH%"
set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%"
:: setup build environment :: create a new conda environment and install packages
:try :try
SET /A tries=3 SET /A tries=3
:loop :loop
IF %tries% LEQ 0 GOTO :exception IF %tries% LEQ 0 GOTO :exception
call setup_build.bat call condaenv.bat
IF %ERRORLEVEL% EQU 0 GOTO :done IF %ERRORLEVEL% EQU 0 GOTO :done
SET /A "tries=%tries%-1" SET /A "tries=%tries%-1"
:exception :exception
echo "Failed to setup build environment" echo "Failed to create conda env"
exit /B 1 exit /B 1
:done :done
@ -63,7 +73,7 @@ if "%DEBUG%" == "1" (
if not "%CUDA_VERSION%" == "cpu" if not "%CUDA_VERSION%" == "xpu" ( if not "%CUDA_VERSION%" == "cpu" if not "%CUDA_VERSION%" == "xpu" (
rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE% rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE%
del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z
curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z %= @lint-ignore =% curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z
7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE% 7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE%
) )
@@ -97,20 +107,19 @@ set TH_BINARY_BUILD=1
 set INSTALL_TEST=0
 for %%v in (%DESIRED_PYTHON_PREFIX%) do (
-:: Set Environment vars for the build
-set "CMAKE_PREFIX_PATH=%CD%\Python\Library\;%PATH%"
-set "PYTHON_LIB_PATH=%CD%\Python\Library\bin"
+:: Activate Python Environment
+set PYTHON_PREFIX=%%v
+set "CONDA_LIB_PATH=%CONDA_HOME%\envs\%%v\Library\bin"
 if not "%ADDITIONAL_PATH%" == "" (
-set "PATH=%ADDITIONAL_PATH%;%PATH%"
+set "PATH=%ADDITIONAL_PATH%;%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%"
+) else (
+set "PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%"
 )
 pip install ninja
 @setlocal
 :: Set Flags
 if not "%CUDA_VERSION%"=="cpu" if not "%CUDA_VERSION%" == "xpu" (
-set "MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE%"
+set MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE%
 )
 echo "Calling arch build script"
 call %CUDA_PREFIX%.bat
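
Note: the :try/:loop block above is the stock batch retry idiom. Its generic shape, as a sketch (some_step.bat is a hypothetical placeholder; the GOTO :loop that re-runs the step is presumably an elided context line in the hunk above):

SET /A tries=3
:loop
IF %tries% LEQ 0 GOTO :exception
call some_step.bat
IF %ERRORLEVEL% EQU 0 GOTO :done
SET /A "tries=%tries%-1"
GOTO :loop
:exception
echo "step failed"
exit /B 1
:done
echo "step succeeded"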

View File

@@ -0,0 +1,27 @@
+IF "%DESIRED_PYTHON%"=="" (
+echo DESIRED_PYTHON is NOT defined.
+exit /b 1
+)
+:: Create a new conda environment
+setlocal EnableDelayedExpansion
+FOR %%v IN (%DESIRED_PYTHON%) DO (
+set PYTHON_VERSION_STR=%%v
+set PYTHON_VERSION_STR=!PYTHON_VERSION_STR:.=!
+conda remove -n py!PYTHON_VERSION_STR! --all -y || rmdir %CONDA_HOME%\envs\py!PYTHON_VERSION_STR! /s
+if "%%v" == "3.9" call conda create -n py!PYTHON_VERSION_STR! -y numpy=2.0.1 boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+if "%%v" == "3.10" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.0.1 boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+if "%%v" == "3.11" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.0.1 boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+if "%%v" == "3.12" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.0.1 boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+if "%%v" == "3.13" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.1.2 boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+if "%%v" == "3.13t" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.1.2 boto3 cmake ninja typing_extensions setuptools=72.1.0 python-freethreading python=3.13
+call conda run -n py!PYTHON_VERSION_STR! pip install pyyaml
+call conda run -n py!PYTHON_VERSION_STR! pip install mkl-include
+call conda run -n py!PYTHON_VERSION_STR! pip install mkl-static
+)
+endlocal
+:: Install libuv
+conda install -y -q -c conda-forge libuv=1.39
+set libuv_ROOT=%CONDA_HOME%\Library
+echo libuv_ROOT=%libuv_ROOT%
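
Note: condaenv.bat keys each environment name off the Python version via delayed expansion -- !PYTHON_VERSION_STR:.=! deletes the dot, so 3.11 becomes the env name py311. A hypothetical invocation (values are illustrative, not from this diff):

set "CONDA_HOME=%CD%\conda"
set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%"
set "DESIRED_PYTHON=3.11;3.12"
call condaenv.bat
:: expected result: envs py311 and py312 with numpy, boto3, cmake, ninja, mkl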

View File

@@ -37,7 +37,7 @@ IF "%CUDA_PATH_V124%"=="" (
 )
 IF "%BUILD_VISION%" == "" (
-set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
+set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
 set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90

View File

@@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" (
 )
 IF "%BUILD_VISION%" == "" (
-set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
+set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
 set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90

View File

@@ -37,7 +37,7 @@ IF "%CUDA_PATH_V128%"=="" (
 )
 IF "%BUILD_VISION%" == "" (
-set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
+set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
 set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
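
Note: the three hunks above re-add 5.0 (Maxwell) and 6.0 (Pascal) to TORCH_CUDA_ARCH_LIST, with the CUDA 12.8 variant also listing 10.0 and 12.0. As standard nvcc/PyTorch behavior (not something this diff changes), each arch entry expands to a -gencode pair, roughly:

:: 8.6       ->  -gencode=arch=compute_86,code=sm_86       (SASS only)
:: "8.6+PTX" ->  -gencode=arch=compute_86,code=compute_86  (embeds PTX too)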

View File

@@ -1,6 +1,6 @@
 @echo off
-curl -k -L "https://sourceforge.net/projects/sevenzip/files/7-Zip/18.05/7z1805-x64.exe/download" -o 7z1805-x64.exe
+curl -k https://www.7-zip.org/a/7z1805-x64.exe -O
 if errorlevel 1 exit /b 1
 start /wait 7z1805-x64.exe /S
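
Note: the flag change tracks the new host: -L (follow redirects) was needed for the SourceForge download link but not for the direct 7-zip.org URL, and -O saves under the remote file name, which here is the same 7z1805-x64.exe that -o spelled out. Standard curl behavior, e.g.:

:: equivalent ways to land 7z1805-x64.exe in the current directory
curl -k https://www.7-zip.org/a/7z1805-x64.exe -O
curl -k https://www.7-zip.org/a/7z1805-x64.exe -o 7z1805-x64.exe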

View File

@@ -8,7 +8,7 @@ goto submodule
 :clone_pytorch
-git clone https://github.com/%PYTORCH_REPO%/%MODULE_NAME% & REM @lint-ignore
+git clone https://github.com/%PYTORCH_REPO%/%MODULE_NAME%
 cd %MODULE_NAME%

View File

@@ -10,7 +10,7 @@ copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib
 copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib
-copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
+copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
 :: Should be set in build_pytorch.bat
 copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib

View File

@@ -1,3 +1,3 @@
-copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
+copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
 :: Should be set in build_pytorch.bat
 copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib

Some files were not shown because too many files have changed in this diff