[dynamo][export] Add some missing trace rules

ghstack-source-id: 229b3c2854b9401b07ff4a563b4da7687739daf2 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164080
2025-10-26 00:24:53 +08:00 · 2025-09-28 21:46:28 -07:00
348 changed files with 3302 additions and 9008 deletions
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -15,8 +15,6 @@ fi
 # Compress the fatbin with -compress-mode=size for CUDA 13
 if [[ "$DESIRED_CUDA" == *"13"* ]]; then
    export TORCH_NVCC_FLAGS="-compress-mode=size"
-    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
-    export BUILD_BUNDLE_PTXAS=1
 fi

 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -372,7 +372,7 @@ if __name__ == "__main__":
    else:
        print("build pytorch without mkldnn backend")

-    os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
+    os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
    if enable_cuda:
        print("Updating Cuda Dependency")
        filename = os.listdir("/pytorch/dist/")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -442,7 +442,7 @@ def build_torchvision(
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

-    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
+    host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel")
    vision_wheel_name = host.list_dir("vision/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))

@ -497,7 +497,7 @@ def build_torchdata(
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

-    host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
+    host.run_cmd(f"cd data && {build_vars} python3 setup.py bdist_wheel")
    wheel_name = host.list_dir("data/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))

@ -553,7 +553,7 @@ def build_torchtext(
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

-    host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
+    host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel")
    wheel_name = host.list_dir("text/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))

@ -614,7 +614,7 @@ def build_torchaudio(
    host.run_cmd(
        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
        && ./packaging/ffmpeg/build.sh \
-        && {build_vars} python3 -m build --wheel --no-isolation"
+        && {build_vars} python3 setup.py bdist_wheel"
    )

    wheel_name = host.list_dir("audio/dist")[0]
@ -726,7 +726,7 @@ def start_build(
    print("Building PyTorch wheel")
    build_opts = ""
    if pytorch_build_number is not None:
-        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
+        build_opts += f" --build-number {pytorch_build_number}"
    # Breakpad build fails on aarch64
    build_vars = "USE_BREAKPAD=0 "
    if branch == "nightly":
@ -747,8 +747,7 @@ def start_build(
        print("build pytorch with mkldnn+acl backend")
        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
        host.run_cmd(
-            f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && "
-            f"{build_vars} python3 -m build --wheel --no-isolation{build_opts}"
+            f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"
        )
        print("Repair the wheel")
        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
@ -764,7 +763,7 @@ def start_build(
    else:
        print("build pytorch without mkldnn backend")
        host.run_cmd(
-            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
+            f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"
        )

    print("Deleting build folder")
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -84,8 +84,8 @@ fi
 _UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
 _UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
 if [[ "$image" == *rocm* ]]; then
-  _UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e
-  _UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77
+  _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
+  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
 fi

 tag=$(echo $image | awk -F':' '{print $2}')
@ -175,6 +175,20 @@ case "$tag" in
    fi
    GCC_VERSION=11
    VISION=yes
+    ROCM_VERSION=6.4
+    NINJA_VERSION=1.9.0
+    TRITON=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    if [[ $tag =~ "benchmarks" ]]; then
+      INDUCTOR_BENCHMARKS=yes
+    fi
+    ;;
+  pytorch-linux-noble-rocm-alpha-py3)
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=11
+    VISION=yes
    ROCM_VERSION=7.0
    NINJA_VERSION=1.9.0
    TRITON=yes
@ -182,9 +196,6 @@ case "$tag" in
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
-    if [[ $tag =~ "benchmarks" ]]; then
-      INDUCTOR_BENCHMARKS=yes
-    fi
    ;;
  pytorch-linux-jammy-xpu-n-1-py3)
    ANACONDA_PYTHON_VERSION=3.10
@ -441,3 +452,12 @@ elif [ "$HAS_TRITON" = "yes" ]; then
  echo "expecting triton to not be installed, but it is"
  exit 1
 fi
+
+# Sanity check cmake version.  Executorch reinstalls cmake and I'm not sure if
+# they support 4.0.0 yet, so exclude them from this check.
+CMAKE_VERSION=$(drun cmake --version)
+if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
+  echo "CMake version is not 4.0.0:"
+  drun cmake --version
+  exit 1
+fi
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -42,6 +42,12 @@ EOF
    rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"

+    # Special case for ROCM_VERSION == 7.0
+    if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then
+        rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2"
+        amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu"
+    fi
+
    # Add amdgpu repository
    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -66,15 +66,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python -m build --wheel --no-isolation
+  CXX=g++-9 conda_run python setup.py bdist_wheel
 elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python -m build --wheel --no-isolation
+  CXX=g++-9 conda_run python setup.py bdist_wheel
 else
-  conda_run python -m build --wheel --no-isolation
+  conda_run python setup.py bdist_wheel
 fi

 # Copy the wheel to /opt for multi stage docker builds
--- a/.ci/docker/manywheel/Dockerfile_cxx11-abi
+++ b/.ci/docker/manywheel/Dockerfile_cxx11-abi
@ -0,0 +1,71 @@
+FROM centos:8 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+
+# change to a valid repo
+RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo
+# enable to install ninja-build
+RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo
+
+RUN yum -y update
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo
+RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++
+
+
+FROM base as openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# Install python
+FROM base as python
+RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel
+ADD common/install_cpython.sh install_cpython.sh
+RUN bash ./install_cpython.sh && rm install_cpython.sh
+
+FROM base as conda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+RUN /opt/conda/bin/conda install -y cmake
+
+FROM base as intel
+# Install MKL
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=conda              /opt/conda                            /opt/conda
+ENV PATH=/opt/conda/bin:$PATH
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM base as patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+RUN cp $(which patchelf) /patchelf
+
+FROM base as jni
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+FROM base as libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+FROM base as final
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=conda              /opt/conda                            /opt/conda
+COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+
+RUN yum install -y ninja-build
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -43,6 +43,12 @@ case ${image} in
        MANY_LINUX_VERSION="2_28_aarch64"
        OPENBLAS_VERSION="v0.3.30"
        ;;
+    manylinuxcxx11-abi-builder:cpu-cxx11-abi)
+        TARGET=final
+        GPU_IMAGE=""
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
+        MANY_LINUX_VERSION="cxx11-abi"
+        ;;
    manylinuxs390x-builder:cpu-s390x)
        TARGET=final
        GPU_IMAGE=s390x/almalinux:8
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -10,11 +10,6 @@ boto3==1.35.42
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:

-build==1.3.0
-#Description: A simple, correct Python build frontend.
-#Pinned versions: 1.3.0
-#test that import:
-
 click
 #Description: Command Line Interface Creation Kit
 #Pinned versions:
@ -111,10 +106,10 @@ networkx==2.8.8
 #Pinned versions: 2.8.8
 #test that import: functorch

-ninja==1.11.1.4
+ninja==1.11.1.3
 #Description: build system. Used in some tests. Used in build to generate build
 #time tracing information
-#Pinned versions: 1.11.1.4
+#Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
@ -172,9 +167,9 @@ pillow==11.0.0
 #Pinned versions: 10.3.0
 #test that import:

-protobuf==5.29.5
+protobuf==5.29.4
 #Description:  Google's data interchange format
-#Pinned versions: 5.29.5
+#Pinned versions: 5.29.4
 #test that import: test_tensorboard.py, test/onnx/*

 psutil
@ -378,7 +373,7 @@ dataclasses_json==0.6.7
 #Pinned versions: 0.6.7
 #test that import:

-cmake==3.31.6
+cmake==4.0.0
 #Description: required for building

 tlparse==0.4.0
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -9,7 +9,7 @@ standard-imghdr==3.13.0; python_version >= "3.13"
 # 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
 # Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.

-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -142,7 +142,7 @@ time CMAKE_ARGS=${CMAKE_ARGS[@]} \
    EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
-    python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR
+    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
 echo "Finished setup.py bdist at $(date)"

 # Build libtorch packages
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
 fi

-echo "Calling -m pip install . -v --no-build-isolation at $(date)"
+echo "Calling 'python -m pip install .' at $(date)"

 if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
    STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -290,13 +290,13 @@ else

      WERROR=1 python setup.py clean

-      WERROR=1 python -m build --wheel --no-isolation
+      WERROR=1 python setup.py bdist_wheel
    else
      python setup.py clean
      if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
        source .ci/pytorch/install_cache_xla.sh
      fi
-      python -m build --wheel --no-isolation
+      python setup.py bdist_wheel
    fi
    pip_install_whl "$(echo dist/*.whl)"

--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -36,11 +36,11 @@ fi
 print_cmake_info
 if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
  # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
-  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation
+  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
 else
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64
+  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then
  print_sccache_stats
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -26,7 +26,6 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
    time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
-    time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering
    time python test/run_test.py --verbose -i distributed/test_store
    time python test/run_test.py --verbose -i distributed/test_symmetric_memory
    time python test/run_test.py --verbose -i distributed/test_pg_wrapper
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -435,7 +435,7 @@ test_inductor_distributed() {

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
  # with if required # gpus aren't available
-  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose
+  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose
  assert_git_not_dirty
 }

@ -1415,7 +1415,7 @@ EOF
  pip3 install -r requirements.txt
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
-  python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
+  python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist"
  python -mpip install base_dist/*.whl
  echo "::endgroup::"

@ -1617,7 +1617,7 @@ test_operator_benchmark() {
  test_inductor_set_cpu_affinity

  cd benchmarks/operator_benchmark/pt_extension
-  python -m pip install . -v --no-build-isolation
+  python -m pip install .

  cd "${TEST_DIR}"/benchmarks/operator_benchmark
  $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
--- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
+++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
@ -70,7 +70,7 @@ sccache --zero-stats
 sccache --show-stats

 # Build the wheel
-python -m build --wheel --no-build-isolation
+python setup.py bdist_wheel
 if ($LASTEXITCODE -ne 0) { exit 1 }

 # Install the wheel locally
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -130,7 +130,7 @@ if "%USE_CUDA%"=="1" (
 :: Print all existing environment variable for debugging
 set

-python -m build --wheel --no-isolation
+python setup.py bdist_wheel
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail
 sccache --show-stats
--- a/.ci/pytorch/windows/arm64/build_pytorch.bat
+++ b/.ci/pytorch/windows/arm64/build_pytorch.bat
@ -48,7 +48,7 @@ sccache --zero-stats
 sccache --show-stats

 :: Call PyTorch build script
-python -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"

 :: show sccache stats
 sccache --show-stats
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -28,5 +28,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t
 if errorlevel 1 exit /b 1

 set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
-%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build
+%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel
 if errorlevel 1 exit /b 1
--- a/.ci/pytorch/windows/internal/setup.bat
+++ b/.ci/pytorch/windows/internal/setup.bat
@ -86,7 +86,7 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_
 goto build_end

 :pytorch
-%PYTHON_EXEC% -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"

 :build_end
 IF ERRORLEVEL 1 exit /b 1
--- a/.ci/pytorch/windows/setup_build.bat
+++ b/.ci/pytorch/windows/setup_build.bat
@ -18,7 +18,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake

 %PYTHON_EXEC% -m pip install pyyaml
 %PYTHON_EXEC% -m pip install mkl-include mkl-static
-%PYTHON_EXEC% -m pip install boto3 requests ninja typing_extensions setuptools==72.1.0
+%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0

 where cmake.exe

--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@ -143,8 +143,7 @@ case $desired_python in
        RENAME_WHEEL=false
        ;;
    3.13t)
-        echo "Using 3.13t deps"
-        mac_version='macosx-11.0-arm64'
+        echo "Using 3.13 deps"
        NUMPY_PINNED_VERSION="==2.1.0"
        RENAME_WHEEL=false
        ;;
@ -186,11 +185,11 @@ export USE_QNNPACK=OFF
 export BUILD_TEST=OFF

 pushd "$pytorch_rootdir"
-echo "Calling -m build --wheel --no-isolation at $(date)"
+echo "Calling setup.py bdist_wheel at $(date)"

-_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python -m build --wheel --no-isolation --outdir "$whl_tmp_dir" -C--plat-name="${mac_version//[-.]/_}"
+_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"

-echo "Finished -m build --wheel --no-isolation at $(date)"
+echo "Finished setup.py bdist_wheel at $(date)"

 if [[ $package_type != 'libtorch' ]]; then
    echo "delocating wheel dependencies"
--- a/.github/ISSUE_TEMPLATE/ci-sev.md
+++ b/.github/ISSUE_TEMPLATE/ci-sev.md
@ -1,10 +1,6 @@
 ---
 name: "⚠️ CI SEV"
 about: Tracking incidents for PyTorch's CI infra.
-title: ''
-labels: ''
-assignees: ''
-
 ---

 > NOTE: Remember to label this issue with "`ci: sev`"
--- a/.github/ISSUE_TEMPLATE/disable-autorevert.md
+++ b/.github/ISSUE_TEMPLATE/disable-autorevert.md
@ -1,18 +0,0 @@
---
-name: DISABLE AUTOREVERT
-about: Disables autorevert when open
-title: "❌\U0001F519 [DISABLE AUTOREVERT]"
-labels: 'ci: disable-autorevert'
-assignees: ''
-
---
-
-This issue, while open, disables the autorevert functionality.
-
-More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md)
-
-
-## Why are you disabling autorevert?
-
-
-## Links to any issues/commits/errors that shows the source of problem
--- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
+++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
@ -1,10 +1,8 @@
 ---
 name: Disable CI jobs (PyTorch Dev Infra only)
 about: Use this template to disable CI jobs
-title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]
-labels: 'module: ci'
-assignees: ''
-
+title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
+labels: "module: ci"
 ---

 > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-78a47f87ce259a48f0391fa9ae15add05ea7432b
+0307428d65acf5cf1a73a70a7722e076bbb83f22
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-0fc62aa26a30ed7ca419d285f285cb5ba02c4394
+c77852e117bdf056c8e9a087e51d6f65cf6ba53d
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -1,5 +1,4 @@
 boto3==1.35.42
-build==1.2.2.post1
 cmake==3.27.*
 expecttest==0.3.0
 fbscribelogger==0.1.7
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -127,6 +127,53 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
    ),
 ]

+ROCM_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="manywheel",
+        build_variant="rocm",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX,
+            arches=["6.4"],
+            python_versions=["3.10"],
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={
+                LABEL_CIFLOW_BINARIES,
+                LABEL_CIFLOW_BINARIES_WHEEL,
+                LABEL_CIFLOW_ROCM,
+            },
+            isolated_workflow=True,
+        ),
+        branches="main",
+    ),
+]
+
+LINUX_BINARY_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="manywheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX,
+            arches=["13.0"],
+            python_versions=["3.12"],
+        ),
+        branches="main",
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.LINUX,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+    ),
+]
+
 WINDOWS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS,
@ -212,6 +259,39 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
    ),
 ]

+WINDOWS_BINARY_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.DEBUG,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.DEBUG,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+]
+
 MACOS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.MACOS_ARM64,
@ -292,10 +372,23 @@ def main() -> None:
            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
            S390X_BINARY_BUILD_WORKFLOWS,
        ),
+        (
+            # Give rocm it's own workflow file
+            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
+            ROCM_SMOKE_WORKFLOWS,
+        ),
+        (
+            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
+            LINUX_BINARY_SMOKE_WORKFLOWS,
+        ),
        (
            jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
            WINDOWS_BINARY_BUILD_WORKFLOWS,
        ),
+        (
+            jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
+            WINDOWS_BINARY_SMOKE_WORKFLOWS,
+        ),
        (
            jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
            MACOS_BINARY_BUILD_WORKFLOWS,
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -36,7 +36,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
+        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
    steps:
      - name: Build docker image
        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -56,6 +56,7 @@ jobs:
          { name: "manylinux2_28-builder",          tag: "rocm7.0",           runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cpu",               runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28_aarch64-builder",  tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" },
+          { name: "manylinuxcxx11-abi-builder",     tag: "cpu-cxx11-abi",     runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "xpu",               runner: "linux.9xlarge.ephemeral" },
        ]
    runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -59,6 +59,7 @@ jobs:
          pytorch-linux-jammy-py3.13-clang12,
          pytorch-linux-jammy-rocm-n-py3,
          pytorch-linux-noble-rocm-n-py3,
+          pytorch-linux-noble-rocm-alpha-py3,
          pytorch-linux-jammy-rocm-n-py3-benchmarks,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
          pytorch-linux-jammy-py3.10-gcc11,
--- a/.github/workflows/generated-linux-binary-libtorch-release-main.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-release-main.yml
@ -0,0 +1,87 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/linux_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: linux-binary-libtorch-release
+
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - 'ciflow/trunk/*'
+  workflow_dispatch:
+
+permissions:
+  id-token: write
+
+env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
+  BINARY_ENV_FILE: /tmp/env
+  BUILD_ENVIRONMENT: linux-binary-libtorch-release
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
+  PYTORCH_ROOT: /pytorch
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 0
+concurrency:
+  group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+  libtorch-cpu-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: cpu
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: libtorch-cpu-shared-with-deps-release
+      build_environment: linux-binary-libtorch-release
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-cpu-shared-with-deps-release-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cpu-shared-with-deps-release-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: cpu
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      build_name: libtorch-cpu-shared-with-deps-release
+      build_environment: linux-binary-libtorch-release
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.4xlarge
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -0,0 +1,88 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/linux_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: linux-binary-manywheel
+
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - 'ciflow/trunk/*'
+  workflow_dispatch:
+
+permissions:
+  id-token: write
+
+env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
+  BINARY_ENV_FILE: /tmp/env
+  BUILD_ENVIRONMENT: linux-binary-manywheel
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
+  PYTORCH_ROOT: /pytorch
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 0
+concurrency:
+  group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+  manywheel-py3_12-cuda13_0-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu130
+      GPU_ARCH_VERSION: "13.0"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+      DESIRED_PYTHON: "3.12"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_12-cuda13_0
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_12-cuda13_0-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_12-cuda13_0-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu130
+      GPU_ARCH_VERSION: "13.0"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+      DESIRED_PYTHON: "3.12"
+      build_name: manywheel-py3_12-cuda13_0
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@ -0,0 +1,136 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/linux_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: linux-binary-manywheel-rocm
+
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - 'ciflow/binaries/*'
+      - 'ciflow/binaries_wheel/*'
+      - 'ciflow/rocm/*'
+  workflow_dispatch:
+
+permissions:
+  id-token: write
+
+env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
+  BINARY_ENV_FILE: /tmp/env
+  BUILD_ENVIRONMENT: linux-binary-manywheel-rocm
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
+  PYTORCH_ROOT: /pytorch
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 0
+concurrency:
+  group: linux-binary-manywheel-rocm-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+  manywheel-py3_10-rocm6_4-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
+      DESIRED_PYTHON: "3.10"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
+      build_name: manywheel-py3_10-rocm6_4
+      build_environment: linux-binary-manywheel-rocm
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_10-rocm6_4-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_10-rocm6_4-build
+      - get-label-type
+    runs-on: linux.rocm.gpu.mi250
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_TYPE: rocm
+      SKIP_ALL_TESTS: 1
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
+      DESIRED_PYTHON: "3.10"
+    steps:
+      - name: Setup ROCm
+        uses: ./.github/actions/setup-rocm
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: manywheel-py3_10-rocm6_4
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: configure aws credentials
+        id: aws_creds
+        if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
+          docker-image-name: manylinux2_28-builder
+          custom-tag-prefix: rocm6.4
+          docker-build-dir: .ci/docker
+          working-directory: pytorch
+      - name: Pull Docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      - name: Test Pytorch binary
+        uses: ./pytorch/.github/actions/test-pytorch-binary
+        env:
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      - name: Teardown ROCm
+        uses: ./.github/actions/teardown-rocm
--- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml
@ -0,0 +1,261 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/windows_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: windows-binary-libtorch-debug
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
+  BUILD_ENVIRONMENT: windows-binary-libtorch-debug
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 1
+  OS: windows
+concurrency:
+  group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+  libtorch-cpu-shared-with-deps-debug-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
+    timeout-minutes: 360
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: debug
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.10"
+    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
+      - uses: actions/upload-artifact@v4.4.0
+        if: always()
+        with:
+          name: libtorch-cpu-shared-with-deps-debug
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+
+  libtorch-cpu-shared-with-deps-debug-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cpu-shared-with-deps-debug-build
+      - get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
+    timeout-minutes: 360
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: debug
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.10"
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cpu-shared-with-deps-debug
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Test PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
--- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml
@ -0,0 +1,261 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/windows_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: windows-binary-libtorch-release
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
+  BUILD_ENVIRONMENT: windows-binary-libtorch-release
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 1
+  OS: windows
+concurrency:
+  group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+  libtorch-cpu-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
+    timeout-minutes: 360
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.10"
+    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
+      - uses: actions/upload-artifact@v4.4.0
+        if: always()
+        with:
+          name: libtorch-cpu-shared-with-deps-release
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+
+  libtorch-cpu-shared-with-deps-release-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cpu-shared-with-deps-release-build
+      - get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
+    timeout-minutes: 360
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.10"
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cpu-shared-with-deps-release
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Test PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -59,14 +59,13 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-cuda12.4-py3.10-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
-      cuda-arch-list: 7.5
      test-matrix: |
        { include: [
-          { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
        ]}
    secrets: inherit

@ -113,13 +112,13 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
    secrets: inherit

-  linux-jammy-cuda12_8-py3_10-gcc9-build:
-    name: linux-jammy-cuda12.8-py3.10-gcc9
+  linux-jammy-cuda12_8-py3_9-gcc9-build:
+    name: linux-jammy-cuda12.8-py3.9-gcc9
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9
+      build-environment: linux-jammy-cuda12.8-py3.9-gcc9
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
      cuda-arch-list: 8.6
      test-matrix: |
@ -129,14 +128,14 @@ jobs:
        ]}
    secrets: inherit

-  linux-jammy-cuda12_8-py3_10-gcc9-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc9
+  linux-jammy-cuda12_8-py3_9-gcc9-test:
+    name: linux-jammy-cuda12.8-py3.9-gcc9
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-cuda12_8-py3_10-gcc9-build
+    needs: linux-jammy-cuda12_8-py3_9-gcc9-build
    with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.9-gcc9
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-cuda12_8-py3_10-gcc9-debug-build:
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -343,14 +343,14 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

-  linux-jammy-xpu-n-py3_10-build:
-    name: linux-jammy-xpu-n-py3.10
+  linux-jammy-xpu-n-py3_9-build:
+    name: linux-jammy-xpu-n-py3.9
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      sync-tag: linux-xpu-n-build
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-      build-environment: linux-jammy-xpu-n-py3.10
+      build-environment: linux-jammy-xpu-n-py3.9
      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
      test-matrix: |
        { include: [
--- a/.github/workflows/rocm-mi355.yml
+++ b/.github/workflows/rocm-mi355.yml
@ -38,7 +38,7 @@ jobs:
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-noble-rocm-py3.12-mi355
-      docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
+      docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3
      sync-tag: rocm-build
      test-matrix: |
        { include: [
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -888,28 +888,23 @@ cmake_dependent_option(
  "(USE_CUDA AND NOT MSVC) OR USE_ROCM"
  OFF)

-
-IF(USE_ROCM AND "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
-  message(WARNING "Setting USE_FBGEMM_GENAI for gfx942 to ON by default, doing ROCM build")
-  set(USE_FBGEMM_GENAI_DEFAULT ON)
-elseif(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
-  message(STATUS "Setting USE_FBGEMM_GENAI to ON by default , doing CUDA build for SM100a")
-  set(USE_FBGEMM_GENAI_DEFAULT ON)
-else()
-  set(USE_FBGEMM_GENAI_DEFAULT OFF)
-endif()
-
 cmake_dependent_option(
  USE_FBGEMM_GENAI
  "Whether to build FBGEMM GenAI quantized GEMM kernels.\
  Will be disabled if not supported by the platform"
-  ${USE_FBGEMM_GENAI_DEFAULT}
-  "(USE_CUDA AND NOT MSVC) OR USE_ROCM"
+  ON
+  "USE_ROCM"
  OFF)

+IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
+  message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
+  set(USE_FBGEMM_GENAI off)
+endif()

 # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
 if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
+  message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
+  set(USE_FBGEMM_GENAI ON)
 endif()

 # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
--- a/aten/src/ATen/Version.cpp
+++ b/aten/src/ATen/Version.cpp
@ -103,9 +103,7 @@ std::string get_cpu_capability() {
 #elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
    case native::CPUCapability::ZVECTOR:
      return "Z VECTOR";
-#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
-    case native::CPUCapability::SVE128:
-      return "SVE128";
+#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
    case native::CPUCapability::SVE256:
      return "SVE256";
 #else
--- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp
+++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp
@ -1,32 +1,22 @@
 #include <ATen/core/PythonOpRegistrationTrampoline.h>
+#include <c10/core/impl/PyInterpreterHooks.h>

+// TODO: delete this
 namespace at::impl {

-// The strategy is that all python interpreters attempt to register themselves
-// as the main interpreter, but only one wins.  Only that interpreter is
-// allowed to interact with the C++ dispatcher.  Furthermore, when we execute
-// logic on that interpreter, we do so hermetically, never setting pyobj field
-// on Tensor.
-
-std::atomic<c10::impl::PyInterpreter*>
-    PythonOpRegistrationTrampoline::interpreter_{nullptr};
+c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr;

 c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
-  return PythonOpRegistrationTrampoline::interpreter_.load();
+  return c10::impl::getGlobalPyInterpreter();
 }

 bool PythonOpRegistrationTrampoline::registerInterpreter(
    c10::impl::PyInterpreter* interp) {
-  c10::impl::PyInterpreter* expected = nullptr;
-  interpreter_.compare_exchange_strong(expected, interp);
-  if (expected != nullptr) {
-    // This is the second (or later) Python interpreter, which means we need
-    // non-trivial hermetic PyObject TLS
-    c10::impl::HermeticPyObjectTLS::init_state();
+  if (interpreter_ != nullptr) {
    return false;
-  } else {
-    return true;
  }
+  interpreter_ = interp;
+  return true;
 }

 } // namespace at::impl
--- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h
+++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h
@ -2,19 +2,21 @@

 #include <ATen/core/dispatch/Dispatcher.h>

-// TODO: this can probably live in c10
+// TODO: We can get rid of this


 namespace at::impl {

+// Manages the single Python interpreter instance for PyTorch.
 class TORCH_API PythonOpRegistrationTrampoline final {
-  static std::atomic<c10::impl::PyInterpreter*> interpreter_;
+  static c10::impl::PyInterpreter* interpreter_;

 public:
-  //  Returns true if you successfully registered yourself (that means
-  //  you are in the hot seat for doing the operator registrations!)
+  // Register the Python interpreter. Returns true on first registration,
+  // false if an interpreter was already registered.
  static bool registerInterpreter(c10::impl::PyInterpreter*);

+  // Returns the registered interpreter via the global PyInterpreter hooks.
  // Returns nullptr if no interpreter has been registered yet.
  static c10::impl::PyInterpreter* getInterpreter();
 };
--- a/aten/src/ATen/cpu/vec/functional_base.h
+++ b/aten/src/ATen/cpu/vec/functional_base.h
@ -102,31 +102,8 @@ struct VecReduceAllSIMD<float, Op> {
 #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
       // !defined(C10_MOBILE)

-#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
-#if defined(CPU_CAPABILITY_SVE256)
-template <typename Op>
-struct VecReduceAllSIMD<float, Op> {
-  static inline float apply(
-      const Op& vec_fun,
-      const Vectorized<float>& acc_vec) {
-    using Vec = Vectorized<float>;
-    Vec v = acc_vec;
-    // 128-bit shuffle
-    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
-    Vec v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 64-bit shuffle
-    ind = svdupq_n_u32(2, 3, 0, 1);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 32-bit shuffle
-    ind = svdupq_n_u32(1, 0, 2, 3);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    return svlasta(svpfalse(), v);
-  }
-};
-#else
+#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    !defined(CPU_CAPABILITY_SVE)
 template <typename Op>
 struct VecReduceAllSIMD<float, Op> {
  static inline float apply(
@ -163,8 +140,35 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
    return vaddvq_f32(acc_vec);
  }
 };
-#endif // defined(CPU_CAPABILITY_SVE256)
 #endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
+       // && !defined(CPU_CAPABILITY_SVE)
+
+#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    defined(CPU_CAPABILITY_SVE256)
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(
+      const Op& vec_fun,
+      const Vectorized<float>& acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    // 128-bit shuffle
+    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
+    Vec v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    // 64-bit shuffle
+    ind = svdupq_n_u32(2, 3, 0, 1);
+    v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    // 32-bit shuffle
+    ind = svdupq_n_u32(1, 0, 2, 3);
+    v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    return svlasta(svpfalse(), v);
+  }
+};
+#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
+       // && defined(CPU_CAPABILITY_SVE256)

 template <typename scalar_t, typename Op>
 inline scalar_t vec_reduce_all(
--- a/aten/src/ATen/cpu/vec/sve/sve_helper.h
+++ b/aten/src/ATen/cpu/vec/sve/sve_helper.h
@ -1,21 +1,9 @@
 #pragma once

 #include <ATen/cpu/vec/intrinsics.h>
-#include <c10/macros/Macros.h>
-#include <cstdint>

 #include <ATen/cpu/vec/vec_base.h>

-#if defined(__aarch64__) &&                     \
-    (defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || \
-     defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
-#define SLEEF_STATIC_LIBS
-#include <sleef.h>
-#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
-#else
-#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
-#endif
-
 #if defined(CPU_CAPABILITY_SVE)

 // Define the data type of VLS(vector-length specific).
--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
@ -2,6 +2,7 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/sve/sve_helper.h>
+#include <ATen/cpu/vec/sve/vec_common_sve.h>
 #include <ATen/cpu/vec/sve/vec_float.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <c10/util/bit_cast.h>
--- a/aten/src/ATen/cpu/vec/sve/vec_common_sve.h
+++ b/aten/src/ATen/cpu/vec/sve/vec_common_sve.h
@ -8,48 +8,13 @@
 #include <ATen/cpu/vec/sve/sve_helper.h>
 #include <ATen/cpu/vec/vec_base.h>

-#ifdef CPU_CAPABILITY_SVE128
-
-#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
-
-#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
-
-#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
-
-#include <ATen/cpu/vec/vec128/vec128_convert.h>
-
-#include <ATen/cpu/vec/sve/vec_qint.h>
-
-#elif defined(CPU_CAPABILITY_SVE)
-
-#include <ATen/cpu/vec/sve/vec_float.h>
-
+#if defined(CPU_CAPABILITY_SVE)
 #include <ATen/cpu/vec/sve/vec_bfloat16.h>
-
 #include <ATen/cpu/vec/sve/vec_double.h>
+#include <ATen/cpu/vec/sve/vec_float.h>
 #include <ATen/cpu/vec/sve/vec_int.h>
-
 #include <ATen/cpu/vec/sve/vec_qint.h>
-
-#include <ATen/cpu/vec/vec256/vec256_half.h>
-
-#include <ATen/cpu/vec/vec256/vec256_convert.h>
-
-#else // NEON
-
-#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
-
-#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
-
-#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
-
-#include <ATen/cpu/vec/vec128/vec128_convert.h>
-
-#include <ATen/cpu/vec/vec256/vec256_qint.h>
-
-#endif // defined(CPU_CAPABILITY_SVE128)
-
-#include <ATen/cpu/vec/functional.h>
+#endif

 namespace at::vec {
 // Note [CPU_CAPABILITY namespace]
@ -83,6 +48,12 @@ DEFINE_SVE_CAST(int32_t, s32, float, f32)
 DEFINE_SVE_CAST(int16_t, s16, float, f32)
 DEFINE_SVE_CAST(float, f32, double, f64)

+#ifdef __ARM_FEATURE_BF16
+DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
+DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
+DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
+#endif // __ARM_FEATURE_BF16
+
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 template <int64_t scale = 1>
@ -202,11 +173,9 @@ std::pair<
  // group cols crossing lanes:
  //   return {a0, b0, a1, b1, a2, b2, a3, b3}
  //          {a4, b4, a5, b5, a6, b6, a7, b7}
-  svbfloat16_t aReg = a;
-  svbfloat16_t bReg = b;
-  Vectorized<c10::BFloat16> c = svzip1_bf16(aReg, bReg);
-  Vectorized<c10::BFloat16> d = svzip2_bf16(aReg, bReg);
-  return std::make_pair(c, d);
+  return std::make_pair(
+      Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
+      Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
 }
 #endif // __ARM_FEATURE_BF16

@ -255,27 +224,12 @@ std::pair<
  // swap lanes:
  //   return {a0, a1, a2, a3, a4, a5, a6, a7}
  //          {b0, b1, b2, b3, b4, b5, b6, b7}
-  svbfloat16_t aReg = a;
-  svbfloat16_t bReg = b;
-  Vectorized<c10::BFloat16> c = svuzp1_bf16(aReg, bReg);
-  Vectorized<c10::BFloat16> d = svuzp2_bf16(aReg, bReg);
-  return std::make_pair(c, d);
+  return std::make_pair(
+      Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
+      Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
 }
 #endif // __ARM_FEATURE_BF16

-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#define DEFINE_FLIP_FUNC(type, sve_func)                    \
-  inline Vectorized<type> flip(const Vectorized<type>& v) { \
-    return Vectorized<type>(sve_func(v));                   \
-  }
-// Use the macro to define the flip functions
-DEFINE_FLIP_FUNC(float, svrev_f32)
-DEFINE_FLIP_FUNC(double, svrev_f64)
-DEFINE_FLIP_FUNC(int64_t, svrev_s64)
-DEFINE_FLIP_FUNC(int32_t, svrev_s32)
-DEFINE_FLIP_FUNC(int16_t, svrev_s16)
-DEFINE_FLIP_FUNC(int8_t, svrev_s8)
-
 #endif // defined(CPU_CAPABILITY_SVE)

 } // namespace CPU_CAPABILITY
--- a/aten/src/ATen/cpu/vec/vec.h
+++ b/aten/src/ATen/cpu/vec/vec.h
@ -1,8 +1,6 @@
 #pragma once

-#if defined(__aarch64__)
-#include <ATen/cpu/vec/vec_common_aarch64.h>
-#elif defined(CPU_CAPABILITY_AVX512)
+#if defined(CPU_CAPABILITY_AVX512)
 #include <ATen/cpu/vec/vec512/vec512.h>
 #else
 #include <ATen/cpu/vec/vec128/vec128.h>
@ -13,34 +11,6 @@ namespace at::vec {
 // See Note [CPU_CAPABILITY namespace]
 inline namespace CPU_CAPABILITY {

-inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
-  stream << val.val_;
-  return stream;
-}
-inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
-  stream << static_cast<int>(val.val_);
-  return stream;
-}
-inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
-  stream << static_cast<unsigned int>(val.val_);
-  return stream;
-}
-
-template <typename T>
-std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
-  T buf[Vectorized<T>::size()];
-  vec.store(buf);
-  stream << "vec[";
-  for (int i = 0; i != Vectorized<T>::size(); i++) {
-    if (i != 0) {
-      stream << ", ";
-    }
-    stream << buf[i];
-  }
-  stream << "]";
-  return stream;
-}
-
 inline Vectorized<bool> convert_to_bool(Vectorized<int8_t> x) {
  __at_align__ bool buffer[x.size()];
  x.ne(Vectorized<int8_t>(0)).store(buffer);
--- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
@ -2,7 +2,6 @@

 // DO NOT DEFINE STATIC DATA IN THIS HEADER!
 // See Note [Do not compile initializers with AVX]
-#include <ATen/cpu/vec/sve/sve_helper.h>
 #include <ATen/cpu/vec/vec128/vec128_float_neon.h>
 #include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
 #include <ATen/cpu/vec/vec_base.h>
@ -263,13 +262,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
            c10::bit_cast<at_bfloat16_t>(val6.x),
            c10::bit_cast<at_bfloat16_t>(val7.x)}) {}

-#ifdef CPU_CAPABILITY_SVE128
-  Vectorized(svbfloat16_t v) : Vectorized16(svget_neonq(v)) {}
-  operator svbfloat16_t() const {
-    return svset_neonq(svundef_bf16(), values);
-  }
-#endif
-
  static Vectorized<c10::BFloat16> blendv(
      const Vectorized<c10::BFloat16>& a,
      const Vectorized<c10::BFloat16>& b,
@ -382,23 +374,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
  Vectorized ge(const Vectorized& other) const;
  Vectorized lt(const Vectorized& other) const;
  Vectorized le(const Vectorized& other) const;
-
-#ifdef CPU_CAPABILITY_SVE128
-
-  template <typename step_t>
-  static Vectorized<BFloat16> arange(
-      BFloat16 base = 0.f,
-      step_t step = static_cast<step_t>(1)) {
-    __at_align__ BFloat16 buffer[size()];
-    for (int64_t i = 0; i < size(); i++) {
-      buffer[i] = base + i * step;
-    }
-    return svget_neonq(
-        svld1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(buffer)));
-  }
-
-#endif // CPU_CAPABILITY_SVE128
-
 }; // Vectorized<c10::BFloat16>

 inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
@ -422,24 +397,6 @@ inline Vectorized<c10::BFloat16> convert_float_bfloat16(
  return Vectorized<c10::BFloat16>(at_vcombine_bf16(x1, x2));
 }

-inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
-  __at_align__ float values[Vectorized<float>::size()];
-  for (const auto k : c10::irange(Vectorized<float>::size())) {
-    values[k] = data[k];
-  }
-  out = Vectorized<float>::loadu(values);
-}
-
-inline void load_fp32_from_bf16(
-    const BFloat16* data,
-    Vectorized<float>& out1,
-    Vectorized<float>& out2) {
-  Vectorized<BFloat16> bf16_vec = Vectorized<BFloat16>::loadu(data);
-  auto floats = convert_bfloat16_float(bf16_vec);
-  out1 = std::get<0>(floats);
-  out2 = std::get<1>(floats);
-}
-
 template <typename Op>
 Vectorized<c10::BFloat16> binary_operator_via_float(
    Op op,
@ -622,12 +579,6 @@ Vectorized<c10::BFloat16> inline fnmsub(
  return -a * b - c;
 }

-#else //
-
-CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
-
-LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
-
 #endif // !defined(C10_MOBILE) && defined(__aarch64__)

 } // namespace CPU_CAPABILITY
--- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@ -4,7 +4,7 @@

 namespace at::vec {
 inline namespace CPU_CAPABILITY {
-#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
+#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
 template <typename src_t>
 struct VecConvert<
    float,
@ -60,7 +60,6 @@ struct VecConvert<float, 1, BFloat16, 1> {
  }
 };

-#endif // defined(__aarch64__) && (!defined(CPU_CAPABILITY_SVE) ||
-       // defined(CPU_CAPABILITY_SVE128))
+#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
 } // namespace CPU_CAPABILITY
 } // namespace at::vec
--- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
@ -4,10 +4,13 @@
 // See Note [Do not compile initializers with AVX]

 #include <ATen/cpu/vec/intrinsics.h>
-#include <ATen/cpu/vec/sve/sve_helper.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <c10/util/irange.h>

+#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#include <sleef.h>
+#endif
+
 // Sleef offers vectorized versions of some transcedentals
 // such as sin, cos, tan etc..
 // However for now opting for STL, since we are not building
@ -32,6 +35,12 @@ inline namespace CPU_CAPABILITY {
 #error "Big endian is not supported."
 #endif

+#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
+#else
+#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
+#endif
+
 template <int index, bool mask_val>
 struct BlendRegs {
  static float32x4_t impl(
@ -85,12 +94,6 @@ class Vectorized<float> {
  operator float32x4_t() const {
    return values;
  }
-#ifdef CPU_CAPABILITY_SVE128
-  Vectorized(svfloat32_t v) : values(svget_neonq(v)) {}
-  operator svfloat32_t() const {
-    return svset_neonq(svundef_f32(), values);
-  }
-#endif
  template <int64_t mask>
  static Vectorized<float> blend(
      const Vectorized<float>& a,
--- a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h
@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec128/vec128_convert.h>
 #include <ATen/cpu/vec/vec128/vec128_float_neon.h>
 #include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
 #include <ATen/cpu/vec/vec_base.h>
@ -24,6 +25,7 @@ inline namespace CPU_CAPABILITY {
 //    https://bugs.llvm.org/show_bug.cgi?id=45824
 // Most likely we will do aarch32 support with inline asm.
 #if !defined(C10_MOBILE) && defined(__aarch64__)
+
 #ifdef __BIG_ENDIAN__
 #error "Big endian is not supported."
 #endif
@ -419,24 +421,6 @@ Vectorized<c10::Half> inline operator+(
 #endif
 }

-inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
-  __at_align__ float values[Vectorized<float>::size()];
-  for (const auto k : c10::irange(Vectorized<float>::size())) {
-    values[k] = data[k];
-  }
-  out = Vectorized<float>::loadu(values);
-}
-
-inline void load_fp32_from_fp16(
-    const c10::Half* data,
-    Vectorized<float>& out1,
-    Vectorized<float>& out2) {
-  Vectorized<c10::Half> f16_vec = Vectorized<c10::Half>::loadu(data);
-  auto floats = convert_half_float(f16_vec);
-  out1 = std::get<0>(floats);
-  out2 = std::get<1>(floats);
-}
-
 template <>
 Vectorized<c10::Half> inline operator-(
    const Vectorized<c10::Half>& a,
@ -672,53 +656,6 @@ Vectorized<c10::Half> inline fnmsub(
  return -a * b - c;
 #endif
 }
-
-#else
-
-#define CONVERT_NON_VECTORIZED_INIT(type, name)                     \
-  inline std::tuple<Vectorized<float>, Vectorized<float>>           \
-      convert_##name##_float(const Vectorized<type>& a) {           \
-    constexpr int64_t K = Vectorized<type>::size();                 \
-    __at_align__ float arr[K];                                      \
-    __at_align__ type arr2[K];                                      \
-    a.store(arr2);                                                  \
-    convert(arr2, arr, K);                                          \
-    return std::make_tuple(                                         \
-        Vectorized<float>::loadu(arr),                              \
-        Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
-  }                                                                 \
-  inline Vectorized<type> convert_float_##name(                     \
-      const Vectorized<float>& a, const Vectorized<float>& b) {     \
-    constexpr int64_t K = Vectorized<type>::size();                 \
-    __at_align__ float arr[K];                                      \
-    __at_align__ type arr2[K];                                      \
-    a.store(arr);                                                   \
-    b.store(arr + Vectorized<float>::size());                       \
-    convert(arr, arr2, K);                                          \
-    return Vectorized<type>::loadu(arr2);                           \
-  }
-
-#define LOAD_FP32_NON_VECTORIZED_INIT(type, name)                           \
-  inline void load_fp32_from_##name(                                        \
-      const type* data, Vectorized<float>& out) {                           \
-    __at_align__ float values[Vectorized<float>::size()];                   \
-    for (const auto k : c10::irange(Vectorized<float>::size())) {           \
-      values[k] = data[k];                                                  \
-    }                                                                       \
-    out = Vectorized<float>::loadu(values);                                 \
-  }                                                                         \
-                                                                            \
-  inline void load_fp32_from_##name(                                        \
-      const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
-    load_fp32_from_##name(data, out1);                                      \
-    data += Vectorized<float>::size();                                      \
-    load_fp32_from_##name(data, out2);                                      \
-  }
-
-CONVERT_NON_VECTORIZED_INIT(Half, half)
-
-LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16)
-
 #endif // !defined(C10_MOBILE) && defined(__aarch64__)

 } // namespace CPU_CAPABILITY
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@ -9,16 +9,21 @@
 #if !(                                                 \
    defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \
    defined(CPU_CAPABILITY_ZVECTOR))
-#include <ATen/cpu/vec/vec256/vec256_double.h>
+#if defined(CPU_CAPABILITY_SVE256)
+#include <ATen/cpu/vec/sve/vec_common_sve.h>
+#else
+// clang-format off
 #include <ATen/cpu/vec/vec256/vec256_float.h>
+#include <ATen/cpu/vec/vec256/vec256_double.h>
 #include <ATen/cpu/vec/vec256/vec256_int.h>
 #include <ATen/cpu/vec/vec256/vec256_qint.h>
+#endif
 #if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16)
 #include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
 #endif
-#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
-#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
 #include <ATen/cpu/vec/vec256/vec256_half.h>
+#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
+#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
 // clang-format on
 #elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
 #include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
@ -51,6 +56,34 @@ namespace at::vec {
 // accessed as `at::vec`.
 inline namespace CPU_CAPABILITY {

+inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
+  stream << val.val_;
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
+  stream << static_cast<int>(val.val_);
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
+  stream << static_cast<unsigned int>(val.val_);
+  return stream;
+}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+  T buf[Vectorized<T>::size()];
+  vec.store(buf);
+  stream << "vec[";
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    if (i != 0) {
+      stream << ", ";
+    }
+    stream << buf[i];
+  }
+  stream << "]";
+  return stream;
+}
+
 #if defined(CPU_CAPABILITY_AVX2)

 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)

 #else // defined(CPU_CAPABILITY_AVX2)

-#if !(defined(__aarch64__))
+#if !(                                                                      \
+    defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    !defined(CPU_CAPABILITY_SVE256))
 CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
 #endif

--- a/aten/src/ATen/cpu/vec/vec256/vec256_half.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_half.h
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)

 #else // defined(CPU_CAPABILITY_AVX2)

-#if !defined(__aarch64__) || defined(CPU_CAPABILITY_SVE256)
+#if !(                                                                      \
+    defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    !defined(CPU_CAPABILITY_SVE256))
 CONVERT_NON_VECTORIZED_INIT(Half, half)
 #endif

--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
@ -5,13 +5,6 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
-
-#ifdef __aarch64__
-#if defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE)
-#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
-#endif
-#endif
-
 #include <ATen/native/quantized/AffineQuantizerBase.h>

 #include <c10/util/irange.h>
@ -922,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
  return a.maximum(b);
 }

-#else
+#elif !defined(CPU_CAPABILITY_SVE256)

 // NOTE: These are low-performance implementations that we fall back on
 // if we are not building with AVX2. This may not be an issue, because
@ -1379,18 +1372,12 @@ Vectorized<c10::quint8> inline maximum(
  return a.maximum(b);
 }

-#if defined(__aarch64__) && \
-    (defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE))
+#endif // if defined(CPU_CAPABILITY_AVX2)
+
+#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
 std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
    at::vec::Vectorized<int8_t> src) {
-
-#ifdef CPU_CAPABILITY_SVE
-  svint8_t x = src;
-  auto s8x8 = vget_low_s8(svget_neonq(x));
-#else
  auto s8x8 = vld1_s8(src.operator const int8_t*());
-#endif
-
  auto s16x8 = vmovl_s8(s8x8);

  auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
@ -1415,14 +1402,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(

 Vectorized<float> inline convert_int8_half_register_to_float(
    at::vec::Vectorized<int8_t> src) {
-
-#ifdef CPU_CAPABILITY_SVE
-  svint8_t x = src;
-  auto s8x8 = vget_low_s8(svget_neonq(x));
-#else
  auto s8x8 = vld1_s8(src.operator const int8_t*());
-#endif
-
  auto s16x8 = vmovl_s8(s8x8);

  auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
@ -1440,8 +1420,5 @@ Vectorized<float> inline convert_int8_half_register_to_float(
 }

 #endif
-
-#endif // if defined(CPU_CAPABILITY_AVX2)
-
 } // namespace CPU_CAPABILITY
 } // namespace at::vec
--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
@ -31,6 +31,34 @@ namespace vec {
 // See Note [CPU_CAPABILITY namespace]
 inline namespace CPU_CAPABILITY {

+inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
+  stream << val.val_;
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
+  stream << static_cast<int>(val.val_);
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
+  stream << static_cast<unsigned int>(val.val_);
+  return stream;
+}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+  T buf[Vectorized<T>::size()];
+  vec.store(buf);
+  stream << "vec[";
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    if (i != 0) {
+      stream << ", ";
+    }
+    stream << buf[i];
+  }
+  stream << "]";
+  return stream;
+}
+
 #if defined(CPU_CAPABILITY_AVX512)

 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512)
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@ -67,7 +67,18 @@ Windows llvm will not have this definition.
 #endif
 #define VECTOR_WIDTH 64
 #define int_vector __m512i
-#elif defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_SVE256)
+#elif defined(__aarch64__) && \
+    !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
+// SVE code expects 256-vectors; leave that set for SVE?
+#if defined(__GNUC__)
+#define __at_align__ __attribute__((aligned(16)))
+#elif defined(_WIN32)
+#define __at_align__ __declspec(align(16))
+#else
+#define __at_align__
+#endif
+#define VECTOR_WIDTH 16
+#else // CPU_CAPABILITY_AVX512
 #if defined(__GNUC__)
 #define __at_align__ __attribute__((aligned(32)))
 #elif defined(_WIN32)
@ -77,27 +88,7 @@ Windows llvm will not have this definition.
 #endif
 #define VECTOR_WIDTH 32
 #define int_vector __m256i
-#elif defined(__aarch64__)
-// Define alignment and vector width for SVE128/Default (e.g., NEON)
-#if defined(__GNUC__)
-#define __at_align__ __attribute__((aligned(16)))
-#elif defined(_WIN32)
-#define __at_align__ __declspec(align(16))
-#else
-#define __at_align__
-#endif
-#define VECTOR_WIDTH 16
-#else
-// Fallback: define default alignment and vector width
-#if defined(__GNUC__)
-#define __at_align__ __attribute__((aligned(32)))
-#elif defined(_WIN32)
-#define __at_align__ __declspec(align(32))
-#else
-#define __at_align__
-#endif
-#define VECTOR_WIDTH 32
-#endif
+#endif // CPU_CAPABILITY_AVX512

 namespace at::vec {
 // See Note [CPU_CAPABILITY namespace]
--- a/aten/src/ATen/cuda/CachingHostAllocator.cpp
+++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp
@ -151,11 +151,6 @@ struct CUDACachingHostAllocatorImpl
  }

  bool query_event(EventPool::Event& event) override {
-    // Do not call cudaEventQuery if capturing is underway
-    if (at::cuda::currentStreamCaptureStatusMayInitCtx() !=
-        at::cuda::CaptureStatus::None) {
-      return false;
-    }
    cudaError_t err = cudaEventQuery(*event);
    if (err == cudaErrorNotReady) {
      (void)cudaGetLastError(); // clear CUDA error
--- a/aten/src/ATen/mps/MPSHooks.mm
+++ b/aten/src/ATen/mps/MPSHooks.mm
@ -70,10 +70,7 @@ void MPSHooks::commitStream() const {
 }

 void* MPSHooks::getCommandBuffer() const {
-  auto stream = at::mps::getDefaultMPSStream();
-  // Release pending computeCommandEncoder, as extensions is likely to allocate new one
-  stream->endKernelCoalescing();
-  return stream->commandBuffer();
+  return at::mps::getDefaultMPSStream()->commandBuffer();
 }

 void* MPSHooks::getDispatchQueue() const {
--- a/aten/src/ATen/mps/MPSStream.mm
+++ b/aten/src/ATen/mps/MPSStream.mm
@ -158,18 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
      endKernelCoalescing();
      id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];

-      // For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
-      // See https://github.com/pytorch/pytorch/issues/163962
-      // Workaround by batching copy commands into 4Gb chunks
-      constexpr size_t max_copy_size = 0x100000000; // 4GB
-      size_t bytes_filled = 0;
-      size_t bytes_remains = length;
-      while (bytes_remains > 0) {
-        NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains);
-        [blitEncoder fillBuffer:buffer range:NSMakeRange(offset + bytes_filled, bytes_to_copy) value:value];
-        bytes_filled += bytes_to_copy;
-        bytes_remains -= bytes_to_copy;
-      }
+      [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value];
      [blitEncoder endEncoding];
      synchronize(syncType);
    }
--- a/aten/src/ATen/native/Activation.cpp
+++ b/aten/src/ATen/native/Activation.cpp
@ -670,8 +670,6 @@ Tensor rrelu_with_noise_backward(
 }

 Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional<Generator> generator) {
-  TORCH_CHECK(std::isfinite(lower.to<double>()), "rrelu: lower bound must be finite, got ", lower.to<double>());
-  TORCH_CHECK(std::isfinite(upper.to<double>()), "rrelu: upper bound must be finite, got ", upper.to<double>());
  TORCH_CHECK(lower.to<double>() <= upper.to<double>(), "Lower bound should be less than or equal to the upper bound")
  auto noise = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  return at::rrelu_with_noise(self, noise, lower, upper, training, std::move(generator));
--- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
@ -1157,103 +1157,103 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel)
 REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel)
 REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel)
 REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel)
-REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
+REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel)

 REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl)
 REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
 REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
 REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
 REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
-REGISTER_SVE_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
+REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)

 REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel)
 REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
 REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
 REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
 REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
-REGISTER_SVE_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
+REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)

 REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel)
 REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
 REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
 REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
 REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
-REGISTER_SVE_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
+REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)

 REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel)
 REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel)
 REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel)
 REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel)
 REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel)
-REGISTER_SVE_DISPATCH(geqrf_stub, &geqrf_kernel)
+REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel)

 REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl)
 REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
 REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
 REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
 REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
-REGISTER_SVE_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
+REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl)

 REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel)
 REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel)
 REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel)
 REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel)
 REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel)
-REGISTER_SVE_DISPATCH(ormqr_stub, &ormqr_kernel)
+REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel)

 REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel)
 REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel)
 REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel)
 REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel)
 REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel)
-REGISTER_SVE_DISPATCH(lstsq_stub, &lstsq_kernel)
+REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel)

 REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel)
 REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
 REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
 REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
 REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
-REGISTER_SVE_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
+REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)

 REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel)
 REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel)
 REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel)
 REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel)
 REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel)
-REGISTER_SVE_DISPATCH(lu_factor_stub, &lu_factor_kernel)
+REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel)

 REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel)
 REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
 REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
 REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
 REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
-REGISTER_SVE_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
+REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)

 REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel)
 REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
 REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
 REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
 REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
-REGISTER_SVE_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
+REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)

 REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel)
 REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel)
 REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel)
 REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel)
 REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel)
-REGISTER_SVE_DISPATCH(lu_solve_stub, &lu_solve_kernel)
+REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel)

 REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel)
 REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel)
 REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel)
 REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel)
 REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel)
-REGISTER_SVE_DISPATCH(svd_stub, &svd_kernel)
+REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel)

 REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel)
 REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
 REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
 REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
 REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
-REGISTER_SVE_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
+REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
 } // namespace at::native
--- a/aten/src/ATen/native/DispatchStub.cpp
+++ b/aten/src/ATen/native/DispatchStub.cpp
@ -39,21 +39,19 @@ static CPUCapability compute_cpu_capability() {
    }
 #elif defined(HAVE_SVE_CPU_DEFINITION)
    int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
-    if (envar == "sve") {
-      // Select SVE capability based on the maximum SVE VL supported by the HW.
+#ifdef HAVE_SVE256_CPU_DEFINITION
+    if (envar == "sve256") {
      if (sve_vl == 256) {
+#ifdef HAVE_ARM_BF16_CPU_DEFINITION
        if (cpuinfo_has_arm_bf16()) {
          return CPUCapability::SVE256;
        }
-      } else if (sve_vl == 128) {
-        if (cpuinfo_has_arm_bf16()) {
-          return CPUCapability::SVE128;
-        }
-      } else {
-        TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
-        return CPUCapability::DEFAULT;
+#endif
      }
+      TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
+      return CPUCapability::DEFAULT;
    }
+#endif
 #else
 #ifdef HAVE_AVX512_CPU_DEFINITION
    if (envar == "avx512") {
@ -115,11 +113,6 @@ static CPUCapability compute_cpu_capability() {
        #endif
        }
    #endif
-    #ifdef HAVE_SVE128_CPU_DEFINITION
-        if (sve_vl == 128) { // Check for SVE128
-            return CPUCapability::SVE128;
-        }
-    #endif
    // Return the default CPU capability.
    return CPUCapability::DEFAULT;
  }
@ -154,9 +147,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
 #ifdef HAVE_SVE256_CPU_DEFINITION
  , void *SVE256
 #endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-  , void *SVE128
-#endif
 ) {
  constexpr auto supported_devices = c10::array_of<c10::DeviceType>(
        c10::DeviceType::CPU,
@ -194,9 +184,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
          , SVE256
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-          , SVE128
 #endif
        );
        if (!std::holds_alternative<ErrorType>(result)) {
@ -255,9 +242,6 @@ void* DispatchStubImpl::get_call_ptr(
 #ifdef HAVE_SVE256_CPU_DEFINITION
  , void *SVE256
 #endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-  , void *SVE128
-#endif
 ) {

  auto result = try_get_call_ptr(
@ -282,10 +266,6 @@ void* DispatchStubImpl::get_call_ptr(
 #ifdef HAVE_SVE256_CPU_DEFINITION
      ,
      SVE256
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-      ,
-      SVE128
 #endif
  );
  if (std::holds_alternative<ErrorType>(result)) {
@ -320,9 +300,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
    , void *SVE256
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-    , void *SVE128
 #endif
  ){

@ -365,16 +342,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
      return DispatchResult(SVE256);
    }
  }
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-  if (capability >= static_cast<int>(CPUCapability::SVE128)) {
-    if (C10_UNLIKELY(!SVE128)) {
-      // dispatch to DEFAULT, since the SVE kernel is missing
-      return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
-    } else {
-      return DispatchResult(SVE128);
-    }
-  }
 #endif
  return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
 }
@ -396,9 +363,6 @@ void* DispatchStubImpl::choose_cpu_impl(
 #ifdef HAVE_SVE256_CPU_DEFINITION
  , void *SVE256
 #endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-  , void *SVE128
-#endif
 ) {
  auto capability = static_cast<int>(get_cpu_capability());
  (void)capability;
@ -444,17 +408,6 @@ void* DispatchStubImpl::choose_cpu_impl(
      return SVE256;
    }
  }
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-  if (capability >= static_cast<int>(CPUCapability::SVE128)) {
-    if (C10_UNLIKELY(!SVE128)) {
-      // dispatch to DEFAULT, since the SVE kernel is missing
-      TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
-      return DEFAULT;
-    } else {
-      return SVE128;
-    }
-  }
 #endif
  TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
  return DEFAULT;
--- a/aten/src/ATen/native/DispatchStub.h
+++ b/aten/src/ATen/native/DispatchStub.h
@ -64,9 +64,8 @@ enum class CPUCapability {
  VSX = 1,
 #elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
  ZVECTOR = 1,
-#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
+#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
  SVE256 = 1,
-  SVE128 = 2,
 #else
  AVX2 = 1,
  AVX512 = 2,
@ -118,9 +117,6 @@ struct TORCH_API DispatchStubImpl {
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
      , void *SVE256
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-      , void *SVE128
 #endif
  );

@ -142,9 +138,6 @@ struct TORCH_API DispatchStubImpl {
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
    , void *SVE256
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-    , void *SVE128
 #endif
  );

@ -166,9 +159,6 @@ struct TORCH_API DispatchStubImpl {
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
      , void *SVE256
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-      , void *SVE128
 #endif
  );

@ -193,9 +183,6 @@ struct TORCH_API DispatchStubImpl {
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
    , void *SVE256
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-    , void *SVE128
 #endif
  );

@ -253,9 +240,6 @@ private:
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
      , reinterpret_cast<void*>(SVE256)
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-      , reinterpret_cast<void*>(SVE128)
 #endif
      )
    );
@ -317,9 +301,6 @@ public:
 #endif
 #ifdef HAVE_SVE256_CPU_DEFINITION
      , reinterpret_cast<void*>(SVE256)
-#endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-      , reinterpret_cast<void*>(SVE128)
 #endif
      );
    if (std::holds_alternative<ErrorType>(result)){
@ -344,9 +325,6 @@ public:
 #ifdef HAVE_SVE256_CPU_DEFINITION
  static TORCH_API FnPtr SVE256;
 #endif
-#ifdef HAVE_SVE128_CPU_DEFINITION
-  static TORCH_API FnPtr SVE128;
-#endif
 private:
  DispatchStubImpl impl;
 };
@ -454,12 +432,6 @@ struct RegisterPRIVATEUSE1Dispatch {
 #define REGISTER_SVE256_DISPATCH(name, fn)
 #endif

-#ifdef HAVE_SVE128_CPU_DEFINITION
-#define REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE128, fn)
-#else
-#define REGISTER_SVE128_DISPATCH(name, fn)
-#endif
-
 // Macro to register the same kernel for all CPU arch types. This is useful
 // if a kernel does not benefit from being recompiled across different arch types.
 #define REGISTER_ALL_CPU_DISPATCH(name, fn)                                    \
@ -468,11 +440,6 @@ struct RegisterPRIVATEUSE1Dispatch {
  REGISTER_AVX2_DISPATCH(name, fn)                                             \
  REGISTER_VSX_DISPATCH(name, fn)                                              \
  REGISTER_ZVECTOR_DISPATCH(name, fn)                                          \
-  REGISTER_SVE256_DISPATCH(name, fn)                                           \
-  REGISTER_SVE128_DISPATCH(name, fn)
-
-#define REGISTER_SVE_DISPATCH(name, fn)                                        \
-  REGISTER_SVE128_DISPATCH(name, fn)                                           \
  REGISTER_SVE256_DISPATCH(name, fn)

 #define REGISTER_NO_CPU_DISPATCH(name)                                         \
@ -515,7 +482,6 @@ struct RegisterPRIVATEUSE1Dispatch {
 // REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
 // ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
 // ALSO_REGISTER_SVE256_DISPATCH should be used for ensuring SVE256 dispatch, among others.
-// ALSO_REGISTER_SVE128_DISPATCH should be used for ensuring SVE128 dispatch, among others.
 #ifdef CPU_CAPABILITY_AVX512
 #define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr))
 #else
@ -523,7 +489,6 @@ struct RegisterPRIVATEUSE1Dispatch {
 #endif
 #define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
 #define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
-#define ALSO_REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
 #endif
 } // namespace at::native

--- a/aten/src/ATen/native/SegmentReduce.cpp
+++ b/aten/src/ATen/native/SegmentReduce.cpp
@ -466,7 +466,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cp
 REGISTER_AVX512_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
 REGISTER_VSX_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
 REGISTER_ZVECTOR_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
-REGISTER_SVE_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
+REGISTER_SVE256_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)

 // offsets dispatches
 REGISTER_ARCH_DISPATCH(
@ -477,7 +477,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cp
 REGISTER_AVX512_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
 REGISTER_VSX_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
 REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
-REGISTER_SVE_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
+REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)

 // Currently some computation is being duplicated across forward and backward.
 // TODO: Cache indices in forward pass to reuse in backward
@ -548,7 +548,7 @@ REGISTER_VSX_DISPATCH(
 REGISTER_ZVECTOR_DISPATCH(
    _segment_reduce_lengths_backward_stub,
    &_segment_reduce_cpu_lengths_backward_kernel)
-REGISTER_SVE_DISPATCH(
+REGISTER_SVE256_DISPATCH(
    _segment_reduce_lengths_backward_stub,
    &_segment_reduce_cpu_lengths_backward_kernel)

@ -568,7 +568,7 @@ REGISTER_VSX_DISPATCH(
 REGISTER_ZVECTOR_DISPATCH(
    _segment_reduce_offsets_backward_stub,
    &_segment_reduce_cpu_offsets_backward_kernel)
-REGISTER_SVE_DISPATCH(
+REGISTER_SVE256_DISPATCH(
    _segment_reduce_offsets_backward_stub,
    &_segment_reduce_cpu_offsets_backward_kernel)

--- a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp
@ -212,7 +212,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
    const vec::Vectorized<c10::Half>& b,
    const vec::Vectorized<float>& acc_low,
    const vec::Vectorized<float>& acc_high) {
-#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
+#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE)
  return std::make_pair(vfmlalq_low_f16(acc_low, a, b), vfmlalq_high_f16(acc_high, a, b));
 #else
  const auto [a_float_low, a_float_high] = convert_half_float(a);
@ -233,7 +233,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(

 // Return a + b_low * c_low + b_high * c_high
 vec::Vectorized<float> fmadd(vec::Vectorized<float> a, vec::Vectorized<Half> b, vec::Vectorized<Half> c) {
-#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)
  // NOTE: this instruction is an optional instruction in ARM v8.2 and
  // v8.3, but mandatory in v8.4 per
  // https://developer.arm.com/documentation/ddi0596/2021-03/SIMD-FP-Instructions/FMLAL--FMLAL2--vector---Floating-point-fused-Multiply-Add-Long-to-accumulator--vector--?lang=en
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -1124,17 +1124,6 @@ bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
      && scale.is_contiguous());
 }

-bool check_size_stride(const at::Tensor& scale, int dim, int size, int stride) {
-  // For Blockwise1x128 and Blockwise128x128,
-  // when the scale tensor has a dimension of size 1, the stride is effectively
-  // "meaningless", i.e. PyTorch decides to use a stride of 1. Thus, the regular
-  // stride check fails. Here, we relax the stride check when the effective
-  // stride is 1.
-
-  return (
-      scale.size(dim) == size && (size <= 1 || scale.stride(dim) == stride));
-}
-
 // 1x16 blocks for packed nvfp4 data and fp8_e4m3fn scales
 bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
  // Multiply t.size(1) by 2 to adjust for fp4x2 packing
@ -1160,24 +1149,15 @@ bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
 }

 bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
-  return (
-      isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat &&
-      scale.dim() == 2 && check_size_stride(scale, 0, t.size(0), 1) &&
-      check_size_stride(
-          scale, 1, ceil_div<int64_t>(t.size(1), 128), t.size(0)));
+  return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2
+      && scale.size(0) == t.size(0) && scale.size(1) == ceil_div<int64_t>(t.size(1), 128)
+      && scale.stride(0) == 1 && scale.stride(1) == t.size(0));
 }

 bool is_blockwise_128x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
-  return (
-      isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat &&
-      scale.dim() == 2 &&
-      check_size_stride(
-          scale,
-          0,
-          ceil_div<int64_t>(t.size(0), 128),
-          round_up<int64_t>(ceil_div<int64_t>(t.size(1), 128), 4)) &&
-      check_size_stride(
-          scale, 1, ceil_div<int64_t>(t.size(1), 128), 1));
+  return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2
+      && scale.size(0) == ceil_div<int64_t>(t.size(0), 128) && scale.size(1) == ceil_div<int64_t>(t.size(1), 128)
+      && scale.stride(0) == round_up<int64_t>(ceil_div<int64_t>(t.size(1), 128), 4) && scale.stride(1) == 1);
 }

 bool is_desired_scaling(const at::Tensor& t, const at::Tensor& scale, ScalingType desired_scaling) {
@ -1831,37 +1811,6 @@ std::optional<c10::ScalarType> out_dtype) {
  return out;
 }

-static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
-  // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
-  TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
-  TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
-
-  const auto batch1_sizes = batch1.sizes();
-  const auto batch2_sizes = batch2.sizes();
-
-  int64_t bs = batch1_sizes[0];
-  int64_t contraction_size = batch1_sizes[2];
-  int64_t res_rows = batch1_sizes[1];
-  int64_t res_cols = batch2_sizes[2];
-  std::vector<int64_t> output_size {bs, res_rows, res_cols};
-
-  TORCH_CHECK(batch2_sizes[0] == bs && batch2_sizes[1] == contraction_size,
-              "Expected size for first two dimensions of batch2 tensor to be: [",
-              bs, ", ", contraction_size, "] but got: [", batch2_sizes[0], ", ", batch2_sizes[1], "].");
-
-  TORCH_CHECK(batch1.scalar_type() == batch2.scalar_type(), "batch1 and batch2 must have the same dtype");
-
-  TORCH_CHECK(out_dtype == batch1.scalar_type() ||
-    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
-    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
-
-  if (!is_bmm && self_baddbmm.has_value()) {
-    const auto& self = self_baddbmm.value();
-    TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
-    TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
-  }
-}
-
 Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
  IntArrayRef batch1_sizes = batch1.sizes();
  IntArrayRef batch2_sizes = batch2.sizes();
@ -1871,7 +1820,12 @@ Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::Sca
 }

 Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
+  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
+
+  TORCH_CHECK(out_dtype == batch1.scalar_type() ||
+    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
+    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
+
  Scalar beta(0.0);
  Scalar alpha(1.0);
  {
@ -1890,7 +1844,12 @@ Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tenso
 }

 Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
+  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
+
+  TORCH_CHECK(out_dtype == batch1.scalar_type() ||
+    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
+    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
+
  {
    NoNamesGuard guard;
    baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1905,12 +1864,6 @@ Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarTy
 }

 Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) {
-  TORCH_CHECK(self.dim() == 2,  "self must be a matrix, got ", self.dim(), "-D tensor");
-  TORCH_CHECK(mat2.dim() == 2,  "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
-  TORCH_CHECK(
-      self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
-      self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-
  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
  TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same");
  TORCH_CHECK(out_dtype == self.scalar_type() ||
@ -1930,14 +1883,6 @@ Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& m
 }

 Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-  TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
-  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
-  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
-  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
-  TORCH_CHECK(
-      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
-      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-
  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
  TORCH_CHECK(out_dtype == self.scalar_type() ||
    (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
--- a/aten/src/ATen/native/cuda/IndexKernel.cu
+++ b/aten/src/ATen/native/cuda/IndexKernel.cu
@ -5,7 +5,6 @@
 #include <array>
 #include <type_traits>
 #include <ATen/core/TensorBase.h>
-#include <ATen/ceil_div.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Dispatch_v2.h>
 #include <ATen/cuda/CUDAContext.h>
@ -84,17 +83,11 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
        auto ind_dim_size = index_size[0];
        auto inp_stride_bytes = index_stride[0];
        auto out_stride_bytes = iter.strides(0)[1];
-        // avoid grid overflow in the fast kernel
-        const int64_t vec_chunks = ceil_div(slice_size, alignment);
-        const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd);
-        const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
-        // if it's an eligible grid we use the fast path, otherwise default to slower path
-        if (blocks_per_slice_upper <= max_grid_y) {
-          at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
-          slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
-          return;
-        }
-    }
+        if (iter.numel() == 0) return;
+        at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
+        slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
+        return;
+      }
  }

  auto sizes = std::array<int64_t, MAX_DIMS>{};
--- a/aten/src/ATen/native/mkl/SpectralOps.cpp
+++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
@ -165,7 +165,7 @@ REGISTER_AVX2_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_co
 REGISTER_AVX512_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
 REGISTER_ZVECTOR_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
 REGISTER_VSX_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
-REGISTER_SVE_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
+REGISTER_SVE256_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)

 // _out variants can be shared between PocketFFT and MKL
 Tensor& _fft_r2c_mkl_out(const Tensor& self, IntArrayRef dim, int64_t normalization,
--- a/aten/src/ATen/native/mps/kernels/Attention.metal
+++ b/aten/src/ATen/native/mps/kernels/Attention.metal
@ -14,8 +14,8 @@ template <typename T, int D, int V = D>
    device T* out [[buffer(3)]],
    const constant uint& gqa_factor [[buffer(4)]],
    const constant uint& N [[buffer(5)]],
-    const constant uint3& qkv_head_strides [[buffer(6)]],
-    const constant uint3& qkv_seq_strides [[buffer(7)]],
+    const constant uint2& k_head_seq_stride [[buffer(6)]],
+    const constant uint2& v_head_seq_stride [[buffer(7)]],
    const constant float& scale [[buffer(8)]],
    const device bool* mask [[buffer(9)]],
    const constant uint3& mask_strides [[buffer(10)]],
@ -28,12 +28,10 @@ template <typename T, int D, int V = D>
  constexpr uint BD = 32;
  constexpr uint qk_per_thread = D / BD;
  constexpr uint v_per_thread = V / BD;
-  const uint q_head_stride = qkv_head_strides.x;
-  const uint q_seq_stride = qkv_seq_strides.x;
-  const uint k_head_stride = qkv_head_strides.y;
-  const uint k_seq_stride = qkv_seq_strides.y;
-  const uint v_head_stride = qkv_head_strides.z;
-  const uint v_seq_stride = qkv_seq_strides.z;
+  const uint k_head_stride = k_head_seq_stride.x;
+  const uint k_seq_stride = k_head_seq_stride.y;
+  const uint v_head_stride = v_head_seq_stride.x;
+  const uint v_seq_stride = v_head_seq_stride.y;
  const uint mask_head_stride = mask_strides.x;
  const uint mask_kv_seq_stride = mask_strides.y;
  const uint mask_q_seq_stride = mask_strides.z;
@ -56,9 +54,9 @@ template <typename T, int D, int V = D>
  const int kv_head_idx = head_idx / gqa_factor;
  const int Q = tpg.y;
  const int group_offset = head_idx * Q + q_seq_idx;
+  const int q_offset = group_offset;
  const int o_offset = group_offset;
-  queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
-      simd_lid * qk_per_thread;
+  queries += q_offset * D + simd_lid * qk_per_thread;
  keys += kv_head_idx * k_head_stride + simd_gid * k_seq_stride +
      simd_lid * qk_per_thread;
  values += kv_head_idx * v_head_stride + simd_gid * v_seq_stride +
@ -158,8 +156,8 @@ template <typename T, int D, int V = D>
    device float* maxs [[buffer(5)]],
    const constant uint& gqa_factor [[buffer(6)]],
    const constant uint& N [[buffer(7)]],
-    const constant uint3& qkv_head_strides [[buffer(8)]],
-    const constant uint3& qkv_seq_strides [[buffer(9)]],
+    const constant uint2& k_head_seq_stride [[buffer(8)]],
+    const constant uint2& v_head_seq_stride [[buffer(9)]],
    const constant float& scale [[buffer(10)]],
    const device bool* mask [[buffer(11)]],
    const constant uint3& mask_strides [[buffer(12)]],
@ -172,12 +170,10 @@ template <typename T, int D, int V = D>
  constexpr int BD = 32;
  constexpr int qk_per_thread = D / BD;
  constexpr int v_per_thread = V / BD;
-  const int q_head_stride = qkv_head_strides.x;
-  const int q_seq_stride = qkv_seq_strides.x;
-  const int k_head_stride = qkv_head_strides.y;
-  const int k_seq_stride = qkv_seq_strides.y;
-  const int v_head_stride = qkv_head_strides.z;
-  const int v_seq_stride = qkv_seq_strides.z;
+  const int k_head_stride = k_head_seq_stride.x;
+  const int k_seq_stride = k_head_seq_stride.y;
+  const int v_head_stride = v_head_seq_stride.x;
+  const int v_seq_stride = v_head_seq_stride.y;
  const int mask_kv_seq_stride = mask_strides.x;
  const int mask_q_seq_stride = mask_strides.y;
  const int mask_head_stride = mask_strides.z;
@ -200,10 +196,10 @@ template <typename T, int D, int V = D>
  const int head_idx = tid.x;
  const int q_seq_idx = tid.y;
  const int o_offset = head_idx * tpg.y + q_seq_idx;
+  const int q_offset = o_offset;
  const int kv_head_idx = head_idx / gqa_factor;

-  queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
-      simd_lid * qk_per_thread;
+  queries += q_offset * D + simd_lid * qk_per_thread;
  keys += kv_head_idx * k_head_stride +
      (block_idx * BN + simd_gid) * k_seq_stride + simd_lid * qk_per_thread;
  values += kv_head_idx * v_head_stride +
@ -524,25 +520,25 @@ kernel void attention(
  }
 }

-#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM)   \
-  template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM    \
-                       "_" #VALUE_DIM)]] kernel void        \
-  sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>(                    \
-      const device DTYPE* queries [[buffer(0)]],            \
-      const device DTYPE* keys [[buffer(1)]],               \
-      const device DTYPE* values [[buffer(2)]],             \
-      device DTYPE* out [[buffer(3)]],                      \
-      const constant uint& gqa_factor [[buffer(4)]],        \
-      const constant uint& N [[buffer(5)]],                 \
-      const constant uint3& qkv_head_strides [[buffer(6)]], \
-      const constant uint3& qkv_seq_strides [[buffer(7)]],  \
-      const constant float& scale [[buffer(8)]],            \
-      const device bool* mask [[buffer(9)]],                \
-      const constant uint3& mask_strides [[buffer(10)]],    \
-      const constant bool& has_mask [[buffer(11)]],         \
-      uint3 tid [[threadgroup_position_in_grid]],           \
-      uint3 tpg [[threadgroups_per_grid]],                  \
-      uint simd_gid [[simdgroup_index_in_threadgroup]],     \
+#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM)    \
+  template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM     \
+                       "_" #VALUE_DIM)]] kernel void         \
+  sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>(                     \
+      const device DTYPE* queries [[buffer(0)]],             \
+      const device DTYPE* keys [[buffer(1)]],                \
+      const device DTYPE* values [[buffer(2)]],              \
+      device DTYPE* out [[buffer(3)]],                       \
+      const constant uint& gqa_factor [[buffer(4)]],         \
+      const constant uint& N [[buffer(5)]],                  \
+      const constant uint2& k_head_seq_stride [[buffer(6)]], \
+      const constant uint2& v_head_seq_stride [[buffer(7)]], \
+      const constant float& scale [[buffer(8)]],             \
+      const device bool* mask [[buffer(9)]],                 \
+      const constant uint3& mask_strides [[buffer(10)]],     \
+      const constant bool& has_mask [[buffer(11)]],          \
+      uint3 tid [[threadgroup_position_in_grid]],            \
+      uint3 tpg [[threadgroups_per_grid]],                   \
+      uint simd_gid [[simdgroup_index_in_threadgroup]],      \
      uint simd_lid [[thread_index_in_simdgroup]]);

 #define INSTANTIATE_SDPA_VECTOR_2PASS_1(DTYPE, QK_DIM, VALUE_DIM) \
@ -557,8 +553,8 @@ kernel void attention(
      device float* maxs [[buffer(5)]],                           \
      const constant uint& gqa_factor [[buffer(6)]],              \
      const constant uint& N [[buffer(7)]],                       \
-      const constant uint3& qkv_head_strides [[buffer(8)]],       \
-      const constant uint3& qkv_seq_strides [[buffer(9)]],        \
+      const constant uint2& k_head_seq_stride [[buffer(8)]],      \
+      const constant uint2& v_head_seq_stride [[buffer(9)]],      \
      const constant float& scale [[buffer(10)]],                 \
      const device bool* mask [[buffer(11)]],                     \
      const constant uint3& mask_strides [[buffer(12)]],          \
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
@ -25,7 +25,7 @@ struct ReductionOp {
  inline opmath_t<T> operator()(
      opmath_t<T> weight_val,
      opmath_t<T> out_val,
-      bool /*is_first*/) {
+      bool is_first) {
    return weight_val + out_val;
  }
 };
@ -44,9 +44,9 @@ template <EmbeddingBagMode M, typename T>
 struct MaybeApplyPerSampleWeight {
  inline opmath_t<T> operator()(
      opmath_t<T> weight_val,
-      uint32_t /*per_sample_weights_index*/,
-      constant T* /*per_sample_weights*/,
-      uint32_t /*per_sample_weights_stride*/) {
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_stride) {
    return weight_val;
  }
 };
@ -71,12 +71,12 @@ struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
 template <EmbeddingBagMode M, typename T, typename I>
 struct MaybeCalcMaxIndex {
  inline void operator()(
-      opmath_t<T> /*weight_val*/,
-      opmath_t<T> /*out_val*/,
-      bool /*is_first*/,
-      thread I& /*max_idx*/,
-      I /*weight_idx*/,
-      bool /*pad*/) {}
+      opmath_t<T> weight_val,
+      opmath_t<T> out_val,
+      bool is_first,
+      thread I& max_idx,
+      I weight_idx,
+      bool pad) {}
 };

 template <typename T, typename I>
--- a/aten/src/ATen/native/mps/operations/Attention.mm
+++ b/aten/src/ATen/native/mps/operations/Attention.mm
@ -182,8 +182,6 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
  uint maxSeqLength = k_.size(2);
  uint N = k_.size(2);
  uint B = q_.size(0) * q_.size(1);
-  uint q_head_stride = q_.stride(1);
-  uint q_seq_stride = q_.stride(2);
  uint k_head_stride = k_.stride(1);
  uint k_seq_stride = k_.stride(2);
  uint v_head_stride = v_.stride(1);
@ -211,8 +209,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
                  out,
                  1,
                  N,
-                  std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
-                  std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
+                  std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
+                  std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
                  scale_factor);

      if (has_mask) {
@ -259,8 +257,6 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
  uint B = batchSize * num_heads;
  uint gqa_factor = q_.size(1) / k_.size(1);

-  uint q_head_stride = q_.stride(1);
-  uint q_seq_stride = q_.stride(2);
  uint k_head_stride = k_.stride(1);
  uint k_seq_stride = k_.stride(2);
  uint v_head_stride = v_.stride(1);
@ -298,8 +294,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
                  maxs,
                  gqa_factor,
                  N,
-                  std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
-                  std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
+                  std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
+                  std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
                  scale_factor);

      if (has_mask) {
--- a/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu
+++ b/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu
@ -101,9 +101,6 @@ __device__ inline bool isinf_device(float v) {
 __device__ inline bool isinf_device(c10::BFloat16 v) {
  return ::isinf(static_cast<float>(v));
 }
-__device__ inline bool isinf_device(at::Half v) {
-  return ::isinf(static_cast<float>(v));
-}

 // CUDA kernel to compute Moving Average Min/Max of the tensor.
 // It uses the running_min and running_max along with averaging const, c.
@ -163,8 +160,8 @@ void _calculate_moving_average(
    std::tie(x_min, x_max) = at::aminmax(x, 1);
    int num_threads = std::min(size, (int64_t)512);
    const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
-    AT_DISPATCH_FLOATING_TYPES_AND2(
-        at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
          scalar_t* x_max_data = x_max.data_ptr<scalar_t>();

@ -184,8 +181,8 @@ void _calculate_moving_average(
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    std::tie(x_min, x_max) = at::aminmax(x);
-    AT_DISPATCH_FLOATING_TYPES_AND2(
-        at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
          scalar_t* x_max_data = x_max.data_ptr<scalar_t>();

@ -224,8 +221,8 @@ void _calc_moving_avg_qparams_helper(
  cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
  int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
  if (per_row_fq) {
-    AT_DISPATCH_FLOATING_TYPES_AND2(
-        at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
          int num_threads = std::min(size, (int64_t)512);
@ -247,8 +244,8 @@ void _calc_moving_avg_qparams_helper(
        });
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
-    AT_DISPATCH_FLOATING_TYPES_AND2(
-        at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
          ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
--- a/aten/src/ATen/native/sparse/FlattenIndicesKernel.cpp
+++ b/aten/src/ATen/native/sparse/FlattenIndicesKernel.cpp
@ -27,6 +27,6 @@ REGISTER_AVX512_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
 REGISTER_AVX2_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
 REGISTER_VSX_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
 REGISTER_ZVECTOR_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
-REGISTER_SVE_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
+REGISTER_SVE256_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)

 } // namespace at::native
--- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp
+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp
@ -161,19 +161,19 @@ REGISTER_AVX512_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_
 REGISTER_AVX2_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
 REGISTER_VSX_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
 REGISTER_ZVECTOR_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
-REGISTER_SVE_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
+REGISTER_SVE256_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)

 REGISTER_ARCH_DISPATCH(sparse_mask_intersection_out_stub, DEFAULT, &sparse_mask_intersection_out_cpu_kernel)
 REGISTER_AVX512_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
 REGISTER_AVX2_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
 REGISTER_VSX_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
 REGISTER_ZVECTOR_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
-REGISTER_SVE_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
+REGISTER_SVE256_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)

 REGISTER_ARCH_DISPATCH(sparse_mask_projection_out_stub, DEFAULT, &sparse_mask_projection_out_cpu_kernel)
 REGISTER_AVX512_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
 REGISTER_AVX2_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
 REGISTER_VSX_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
 REGISTER_ZVECTOR_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
-REGISTER_SVE_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
+REGISTER_SVE256_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
 }
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@ -448,7 +448,7 @@ REGISTER_AVX2_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
 REGISTER_AVX512_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
 REGISTER_VSX_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
 REGISTER_ZVECTOR_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
-REGISTER_SVE_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
+REGISTER_SVE256_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
 REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta)

 int64_t _fused_sdp_choice_meta(
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@ -316,12 +316,6 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
  return false;
 #endif
 #else
-  if (!at::cuda::is_available()) {
-    if (debug) {
-      TORCH_WARN("flash attention requires a CUDA device, which is not available.");
-    }
-    return false;
-  }
  auto dprops = at::cuda::getCurrentDeviceProperties();
  if (!check_sm_version<sm80, sm121>(dprops)) {
    if (debug) {
@ -373,12 +367,6 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
  return false;
 #endif
 #else
-  if (!at::cuda::is_available()) {
-    if (debug) {
-      TORCH_WARN("Mem Efficient attention requires a CUDA device, which is not available.");
-    }
-    return false;
-  }
  auto dprops = at::cuda::getCurrentDeviceProperties();
  if (!check_sm_version<sm50, sm121>(dprops)) {
    if (debug) {
@ -609,12 +597,6 @@ bool check_cudnn_layout(sdp_params const& params, bool debug) {
 bool check_cudnn_hardware_support(sdp_params const& params, bool debug) {
  using sm80 = SMVersion<8, 0>;
  using sm121 = SMVersion<12, 1>;
-  if (!at::cuda::is_available()) {
-    if (debug) {
-      TORCH_WARN("cuDNN SDPA requires a CUDA device, which is not available.");
-    }
-    return false;
-  }
  auto dprops = at::cuda::getCurrentDeviceProperties();
  if (!check_sm_version<sm80, sm121>(dprops)) {
    if (debug) {
--- a/benchmarks/operator_benchmark/README.md
+++ b/benchmarks/operator_benchmark/README.md
@ -20,7 +20,7 @@ Key Features:
 The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite.
 ```bash
 cd pt_extension
-python -m pip install . -v --no-build-isolation
+python -m pip install .
 ```

 ## How to run the benchmarks:
--- a/c10/core/DeviceType.h
+++ b/c10/core/DeviceType.h
@ -1,16 +1,100 @@
 #pragma once

+// This is directly synchronized with caffe2/proto/caffe2.proto, but
+// doesn't require me to figure out how to get Protobuf headers into
+// ATen/core (which would require a lot more build system hacking.)
+// If you modify me, keep me synchronized with that file.
+
 #include <c10/macros/Export.h>

-// If you modified DeviceType in caffe2/proto/caffe2.proto, please also sync
-// your changes into torch/headeronly/core/DeviceType.h.
-#include <torch/headeronly/core/DeviceType.h>
-
+#include <cstddef>
+#include <cstdint>
+#include <functional>
 #include <ostream>
 #include <string>

 namespace c10 {

+// These contains all device types that also have a BackendComponent
+// and therefore participate in per-backend functionality dispatch keys.
+// This is most backends except PrivateUse2 and PrivateUse3
+#define C10_FORALL_BACKEND_DEVICE_TYPES(_, extra) \
+  _(CPU, extra)                                   \
+  _(CUDA, extra)                                  \
+  _(HIP, extra)                                   \
+  _(XLA, extra)                                   \
+  _(MPS, extra)                                   \
+  _(IPU, extra)                                   \
+  _(XPU, extra)                                   \
+  _(HPU, extra)                                   \
+  _(VE, extra)                                    \
+  _(Lazy, extra)                                  \
+  _(Meta, extra)                                  \
+  _(MTIA, extra)                                  \
+  _(PrivateUse1, extra)
+
+enum class DeviceType : int8_t {
+  CPU = 0,
+  CUDA = 1, // CUDA.
+  MKLDNN = 2, // Reserved for explicit MKLDNN
+  OPENGL = 3, // OpenGL
+  OPENCL = 4, // OpenCL
+  IDEEP = 5, // IDEEP.
+  HIP = 6, // AMD HIP
+  FPGA = 7, // FPGA
+  MAIA = 8, // ONNX Runtime / Microsoft
+  XLA = 9, // XLA / TPU
+  Vulkan = 10, // Vulkan
+  Metal = 11, // Metal
+  XPU = 12, // XPU
+  MPS = 13, // MPS
+  Meta = 14, // Meta (tensors with no data)
+  HPU = 15, // HPU / HABANA
+  VE = 16, // SX-Aurora / NEC
+  Lazy = 17, // Lazy Tensors
+  IPU = 18, // Graphcore IPU
+  MTIA = 19, // Meta training and inference devices
+  PrivateUse1 = 20, // PrivateUse1 device
+  // NB: If you add more devices:
+  //  - Change the implementations of DeviceTypeName and isValidDeviceType
+  //    in DeviceType.cpp
+  //  - Change the number below
+  COMPILE_TIME_MAX_DEVICE_TYPES = 21,
+};
+
+constexpr DeviceType kCPU = DeviceType::CPU;
+constexpr DeviceType kCUDA = DeviceType::CUDA;
+constexpr DeviceType kHIP = DeviceType::HIP;
+constexpr DeviceType kFPGA = DeviceType::FPGA;
+constexpr DeviceType kMAIA = DeviceType::MAIA;
+constexpr DeviceType kXLA = DeviceType::XLA;
+constexpr DeviceType kMPS = DeviceType::MPS;
+constexpr DeviceType kMeta = DeviceType::Meta;
+constexpr DeviceType kVulkan = DeviceType::Vulkan;
+constexpr DeviceType kMetal = DeviceType::Metal;
+constexpr DeviceType kXPU = DeviceType::XPU;
+constexpr DeviceType kHPU = DeviceType::HPU;
+constexpr DeviceType kVE = DeviceType::VE;
+constexpr DeviceType kLazy = DeviceType::Lazy;
+constexpr DeviceType kIPU = DeviceType::IPU;
+constexpr DeviceType kMTIA = DeviceType::MTIA;
+constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1;
+
+// define explicit int constant
+constexpr int COMPILE_TIME_MAX_DEVICE_TYPES =
+    static_cast<int>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
+
+static_assert(
+    COMPILE_TIME_MAX_DEVICE_TYPES <= 21,
+    "Hey!  You seem to be adding a lot of new DeviceTypes.  The intent was "
+    "for this constant to reflect the actual number of DeviceTypes we support "
+    "in PyTorch; it's important that this number is not too large as we "
+    "use this to allocate stack arrays in some places in our code.  If you "
+    "are indeed just adding the 20th device type, feel free to change "
+    "the check to 32; but if you are adding some sort of extensible device "
+    "types registration, please be aware that you are affecting code that "
+    "this number is small.  Try auditing uses of this constant.");
+
 C10_API std::string DeviceTypeName(DeviceType d, bool lower_case = false);

 C10_API bool isValidDeviceType(DeviceType d);
@ -24,6 +108,15 @@ C10_API bool is_privateuse1_backend_registered();

 } // namespace c10

+namespace std {
+template <>
+struct hash<c10::DeviceType> {
+  std::size_t operator()(c10::DeviceType k) const {
+    return std::hash<int>()(static_cast<int>(k));
+  }
+};
+} // namespace std
+
 namespace torch {
 // NOLINTNEXTLINE(misc-unused-using-decls)
 using c10::DeviceType;
--- a/c10/core/impl/HermeticPyObjectTLS.cpp
+++ b/c10/core/impl/HermeticPyObjectTLS.cpp
@ -1,21 +0,0 @@
-#include <c10/core/impl/HermeticPyObjectTLS.h>
-
-namespace c10::impl {
-
-thread_local static std::atomic<bool> hermeticPyObjectState{false};
-
-std::atomic<bool> HermeticPyObjectTLS::haveState_{false};
-
-void HermeticPyObjectTLS::set_state(bool state) {
-  hermeticPyObjectState = state;
-}
-
-bool HermeticPyObjectTLS::get_tls_state() {
-  return hermeticPyObjectState;
-}
-
-void HermeticPyObjectTLS::init_state() {
-  haveState_ = true;
-}
-
-} // namespace c10::impl
--- a/c10/core/impl/HermeticPyObjectTLS.h
+++ b/c10/core/impl/HermeticPyObjectTLS.h
@ -1,62 +0,0 @@
-#pragma once
-
-#include <c10/macros/Export.h>
-#include <atomic>
-
-namespace c10::impl {
-
-// This TLS controls whether or not we permanently associate PyObject
-// with Tensor the first time it is allocated.  When hermetic PyObject
-// TLS is enabled (state is true), we DO NOT save PyObjects to Tensor,
-// meaning you get a distinct PyObject whenever you execute the code in
-// question.
-struct C10_API HermeticPyObjectTLS {
-  static void set_state(bool state);
-  static bool get_state() {
-    // Hypothetical fastpath if torchdeploy/multipy // codespell:ignore multipy
-    // isn't used. Per
-    // https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
-    // this qualifies relaxed access because it is a single-location data
-    // structure (only the boolean here).
-    //
-    // Forgetting about data races for a moment, is there a logical race?
-    //
-    //  - Boolean only ever transitions from false to true.  So the
-    //    critical situation is when one interpreter is already running
-    //    when a second interpreter switches haveState from false to true.
-    //
-    //  - The first interpreter is indifferent whether or not it sees
-    //    hasState true/false; obviously false works (this is what the
-    //    interpreter was previously using; more directly, the interpreter
-    //    calls into itself as the handler, so being hermetic is not
-    //    required), and true simply means serviced python operator calls will
-    //    be hermetic; in these cases it is expected to be functionally
-    //    equivalent.
-    //
-    //  - The second interpreter MUST see hasState true (as its requests will
-    //    be forwarded to the first interpreter), but it is assumed that there
-    //    is a synchronization between the interpreter initialization, and
-    //    when we actually perform operations, so it is guaranteed to see
-    //    hasState true.
-    //
-    // QED.
-    //
-    // This fastpath is currently disabled so that we can more easily test that
-    // hermetic mode works correctly even on stock build of PyTorch.
-    if (false && !haveState_.load(std::memory_order_relaxed))
-      return false;
-    return get_tls_state();
-  }
-  // Call this from the multipy/torchdeploy // codespell:ignore multipy
-  // top level
-  static void init_state();
-
- private:
-  // This only flipped once from false to true during
-  // torchdeploy/multipy initialization, // codespell:ignore multipy
-  // and never again.
-  static std::atomic<bool> haveState_;
-  static bool get_tls_state();
-};
-
-} // namespace c10::impl
--- a/c10/core/impl/PyObjectSlot.h
+++ b/c10/core/impl/PyObjectSlot.h
@ -1,6 +1,5 @@
 #pragma once

-#include <c10/core/impl/HermeticPyObjectTLS.h>
 #include <c10/core/impl/PyInterpreter.h>
 #include <c10/core/impl/PyInterpreterHooks.h>
 #include <c10/util/python_stub.h>
@ -42,32 +41,15 @@ struct C10_API PyObjectSlot {

  PyObject* _unchecked_untagged_pyobj() const;

-  // Test the interpreter tag.  If tagged for the current interpreter, return
-  // a non-nullopt (but possibly null) PyObject.  If (possibly) untagged,
-  // returns a nullopt.  If it is definitely invalid, raises an error.
-  //
-  // If `ignore_hermetic_tls` is false and this function is called from a
-  // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
-  // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
-  // context is ignored, allowing you to check the interpreter tag of a
-  // nonhermetic PyObject from within a hermetic context. This is necessary
-  // because there are some cases where the deallocator function of a
-  // nonhermetic PyObject is called from within a hermetic context, so it must
-  // be properly treated as a nonhermetic PyObject.
-  //
+  // Test the interpreter / PyObj as they may be null
  // NB: this lives in header so that we can avoid actually creating the
  // std::optional

-  // @todo alban: I'm not too sure what's going on here, we can probably delete
-  // it but it's worthwhile making sure
  std::optional<PyObject*> check_pyobj() const {
    impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
    if (interpreter == nullptr || pyobj_ == nullptr) {
      return std::nullopt;
    }
-    if (c10::impl::HermeticPyObjectTLS::get_state()) {
-      return std::nullopt;
-    }
    return _unchecked_untagged_pyobj();
  }

--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -1183,16 +1183,6 @@ class DeviceCachingAllocator {
  //     ends.
  ska::flat_hash_map<Block*, std::vector<cudaGraphNode_t>> deferred_blocks;

-  // Incremental reverse-traversal state cached per graph.
-  // We never re-traverse nodes we've already seen
-  struct GraphReuseContext {
-    ska::flat_hash_map<cudaStream_t, ska::flat_hash_set<cudaGraphNode_t>>
-        visited;
-  };
-  ska::flat_hash_map<MempoolId_t, CaptureId_t, MempoolIdHash>
-      mempool_to_capture_id;
-  ska::flat_hash_map<CaptureId_t, GraphReuseContext> graph_reuse_context;
-
  // outstanding cuda events
  ska::flat_hash_map<
      cuda::CUDAStream,
@ -1648,70 +1638,44 @@ class DeviceCachingAllocator {
    return block;
  }

-  struct CaptureInfo {
-    cudaGraph_t graph{};
-    CaptureId_t capture_id{0};
-    const cudaGraphNode_t* terminals{nullptr};
-    size_t num_terminals{0};
-    cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
-  };
-
-  inline CaptureInfo stream_get_capture_info(cudaStream_t stream) {
-    CaptureInfo info{};
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
-    C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
-        stream,
-        &info.status,
-        &info.capture_id,
-        &info.graph,
-        &info.terminals,
-        nullptr,
-        &info.num_terminals));
-#else
-    C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
-        stream,
-        &info.status,
-        &info.capture_id,
-        &info.graph,
-        &info.terminals,
-        &info.num_terminals));
-#endif
-    TORCH_INTERNAL_ASSERT(
-        info.status != cudaStreamCaptureStatusInvalidated,
-        "Invalid stream capture status");
-
-    return info;
-  }
-
-  // Record "free marker" of the CUDA graph for all streams that
+  // Insert "free marker" (empty nodes) into the CUDA graph for all streams that
  // have used the block, including the allocation stream. These nodes mark the
  // last use of the block in the capture graph. Returns a vector of the
  // inserted nodes, or an empty vector if any stream is not capturing.
-  std::vector<cudaGraphNode_t> record_free_markers(Block* block) {
-    // Is is possible to have the same marker recorded multiple times, so we use
-    // a set to avoid duplicates
-    ska::flat_hash_set<cudaGraphNode_t> markers;
-    cudaGraph_t owning_graph = nullptr;
+  std::vector<cudaGraphNode_t> insert_free_marker(Block* block) {
+    std::vector<cudaGraphNode_t> empty_nodes;

-    auto try_record = [&](cudaStream_t s) -> bool {
-      auto info = stream_get_capture_info(s);
-      if (info.status == cudaStreamCaptureStatusNone) {
-        return false; // not capturing on this stream -> must defer
-      }
+    auto try_add_empty_node = [&](cudaStream_t stream) -> bool {
+      cudaStreamCaptureStatus status{};
+      cudaGraph_t graph{};
+      const cudaGraphNode_t* deps = nullptr;
+      size_t num_deps = 0;
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
+      C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
+          stream, &status, nullptr, &graph, &deps, nullptr, &num_deps));
+#else
+      C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
+          stream, &status, nullptr, &graph, &deps, &num_deps));
+#endif

-      if (owning_graph == nullptr) {
-        owning_graph = info.graph;
-      }
      TORCH_INTERNAL_ASSERT(
-          info.graph == owning_graph,
-          "All streams in the same capture should agree on the graph");
+          status != cudaStreamCaptureStatusInvalidated,
+          "Invalid stream capture status");

-      // Use current terminals as the free markers for the stream
-      for (size_t i = 0; i < info.num_terminals; ++i) {
-        auto terminal = info.terminals[i];
-        markers.insert(terminal);
+      if (status == cudaStreamCaptureStatusNone) {
+        return false;
      }
-      owning_graph = info.graph; // all streams in the same capture should agree
+
+      cudaGraphNode_t node{};
+      C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps));
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
+      C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
+          stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies));
+#else
+      C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
+          stream, &node, 1, cudaStreamSetCaptureDependencies));
+#endif
+      empty_nodes.push_back(node);
      return true;
    };

@ -1719,34 +1683,81 @@ class DeviceCachingAllocator {
    // An empty vector indicates that the block should be deferred for freeing
    // until after capture.

-    // Allocation stream
-    if (!try_record(block->stream)) {
+    // Attempt to add an empty node for the allocation stream.
+    if (!try_add_empty_node(block->stream)) {
      return {};
    }
-    // Any extra streams that used this block
+    // Attempt to add empty nodes for all streams that have used the block.
    for (const auto& s : block->stream_uses) {
-      if (!try_record(s.stream())) {
+      if (!try_add_empty_node(s.stream())) {
        return {};
      }
    }
-    return std::vector<cudaGraphNode_t>(markers.begin(), markers.end());
+    return empty_nodes;
  }

-  // Returns the set of "reusable" free markers in the current
+  // Returns the current set of "terminal" nodes in the CUDA graph for a given
+  // stream. These represent the current endpoints of the stream, and may
+  // include additional nodes if the graph branches. Any new work captured will
+  // be attached after one or more of these terminals.
+  std::vector<cudaGraphNode_t> get_terminals(cudaStream_t stream) {
+    std::vector<cudaGraphNode_t> result;
+
+    cudaStreamCaptureStatus status{};
+    cudaGraph_t graph{};
+    const cudaGraphNode_t* dependencies = nullptr;
+    size_t num_dependencies = 0;
+
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
+    C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
+        stream,
+        &status,
+        nullptr,
+        &graph,
+        &dependencies,
+        nullptr,
+        &num_dependencies));
+#else
+    C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
+        stream, &status, nullptr, &graph, &dependencies, &num_dependencies));
+#endif
+
+    TORCH_INTERNAL_ASSERT(
+        status == cudaStreamCaptureStatusActive,
+        "Invalid stream capture status");
+
+    for (size_t i = 0; i < num_dependencies; i++) {
+      auto node = dependencies[i];
+      if (node != nullptr) {
+        result.push_back(node);
+      }
+    }
+
+    return result;
+  }
+
+  // Returns the set of "reusable" free markers (empty nodes) in the current
  // CUDA graph capture. A free marker is considered reusable if it is a
  // predecessor of every terminal node.
  // This ensures that all future captured work will occur after the free
  // marker, making it safe to reuse.
-  void update_visited(
-      const CaptureInfo& info,
-      ska::flat_hash_set<cudaGraphNode_t>& visited) {
-    // This is the versioned cudaGraphNodeGetDependencies helper function.
-    auto node_get_dependencies =
-        [](cudaGraphNode_t n, cudaGraphNode_t* deps, size_t* count) -> void {
+  ska::flat_hash_set<cudaGraphNode_t> get_reusable_empty_nodes(
+      cudaStream_t stream) {
+    auto terminals = get_terminals(stream);
+    if (terminals.empty()) {
+      // No terminal nodes found; nothing to free.
+      return {};
+    }
+
+    auto get_dependencies = [](cudaGraphNode_t node,
+                               cudaGraphNode_t* pDependencies,
+                               size_t* pNumDependencies) -> void {
 #if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
-      C10_CUDA_CHECK(cudaGraphNodeGetDependencies(n, deps, nullptr, count));
+      C10_CUDA_CHECK(cudaGraphNodeGetDependencies(
+          node, pDependencies, nullptr, pNumDependencies));
 #else
-      C10_CUDA_CHECK(cudaGraphNodeGetDependencies(n, deps, count));
+      C10_CUDA_CHECK(
+          cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies));
 #endif
    };

@ -1754,43 +1765,62 @@ class DeviceCachingAllocator {
    auto get_parents =
        [&](cudaGraphNode_t node) -> std::vector<cudaGraphNode_t> {
      size_t count = 0;
-
-      node_get_dependencies(node, nullptr, &count);
+      get_dependencies(node, nullptr, &count);
      std::vector<cudaGraphNode_t> out(count);
      if (count) {
-        node_get_dependencies(node, out.data(), &count);
+        get_dependencies(node, out.data(), &count);
        out.resize(count);
      }
      return out;
    };

-    // For each terminal node, perform a reverse DFS to count, for each free
-    // marker, how many terminals it can reach (i.e., for how many terminals it
-    // is a predecessor). A free marker is reusable if it is a predecessor of
-    // all terminal nodes.
-    std::deque<cudaGraphNode_t> dfs;
-    for (size_t i = 0; i < info.num_terminals; ++i) {
-      dfs.push_back(info.terminals[i]);
+    // Helper to determine if a node is an empty node (used as a free marker).
+    auto is_empty_node = [](cudaGraphNode_t n) -> bool {
+      cudaGraphNodeType type{};
+      C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type));
+      return type == cudaGraphNodeTypeEmpty;
+    };
+
+    // For each terminal node, perform a reverse DFS to count, for each empty
+    // node, how many terminals it can reach (i.e., for how many terminals it is
+    // a predecessor). An empty node is reusable if it is a predecessor of all
+    // terminal nodes.
+    ska::flat_hash_map<cudaGraphNode_t, size_t> num_terminals_reachable;
+
+    for (auto terminal : terminals) {
+      ska::flat_hash_set<cudaGraphNode_t> visited;
+      ska::flat_hash_set<cudaGraphNode_t> empty_nodes;
+
+      std::function<void(cudaGraphNode_t)> reverse_dfs =
+          [&](cudaGraphNode_t node) {
+            if (!visited.insert(node).second)
+              return;
+
+            if (is_empty_node(node)) {
+              num_terminals_reachable[node]++;
+              empty_nodes.insert(node);
+            }
+            auto parents = get_parents(node);
+            for (auto p : parents) {
+              reverse_dfs(p);
+            }
+          };
+
+      reverse_dfs(terminal);
    }

-    while (!dfs.empty()) {
-      auto v = dfs.back();
-      dfs.pop_back();
-
-      if (visited.count(v)) {
-        continue;
-      }
-      visited.insert(v);
-
-      auto parents = get_parents(v);
-      for (auto p : parents) {
-        dfs.push_back(p);
+    ska::flat_hash_set<cudaGraphNode_t> reusable_empty_nodes;
+    for (auto [node, count] : num_terminals_reachable) {
+      if (count == terminals.size()) {
+        reusable_empty_nodes.insert(node);
      }
    }
+
+    return reusable_empty_nodes;
  }

  // A block is considered reusable during CUDA graph capture if every free
-  // marker associated with the block is a predecessor of every
+  // marker (empty node) associated with the block is a predecessor of every
  // terminal node.
  //
  // This ensures that any new operation added to the graph will be attached
@ -1799,52 +1829,36 @@ class DeviceCachingAllocator {
  // on every stream, so the block's previous lifetime ends before any new
  // lifetime begins. This check relies solely on the DAG topology and does not
  // require event queries, making it safe to use during capture.
+  //
+  // This function iterates over all deferred blocks, determines if their empty
+  // nodes are reusable according to the above criteria, and frees the block if
+  // so.
  void free_safe_blocks_in_capture(
      const std::shared_ptr<GatheredContext>& context,
      cudaStream_t stream) {
-    auto info = stream_get_capture_info(stream);
+    auto reusable_empty_nodes = get_reusable_empty_nodes(stream);

    // If there are no reusable empty nodes (e.g., not currently capturing),
    // there is nothing to do.
-    if (info.status == cudaStreamCaptureStatusNone || info.num_terminals == 0) {
+    if (reusable_empty_nodes.empty()) {
      return;
    }
-    if (graph_reuse_context.find(info.capture_id) ==
-        graph_reuse_context.end()) {
-      bool found = false;
-      for (auto& entry : captures_underway) {
-        if (entry.second(stream)) {
-          auto graph_pool = graph_pools.find(entry.first);
-          TORCH_INTERNAL_ASSERT(
-              graph_pool != graph_pools.end(),
-              "Could not find graph pool for capture.");
-          auto mempool_id = graph_pool->first;
-          graph_reuse_context[info.capture_id] = GraphReuseContext{};
-          mempool_to_capture_id[mempool_id] = info.capture_id;
-          found = true;
-          break;
-        }
-      }
-      TORCH_INTERNAL_ASSERT(
-          found, "Could not find memory pool id for capture.");
-    }
-    auto& graph_context = graph_reuse_context[info.capture_id];
-    auto& visited = graph_context.visited[stream];
-    update_visited(info, visited);

    std::vector<Block*> blocks_to_erase;
-    for (auto& [block, markers] : deferred_blocks) {
-      // Skip this block if it has no markers, as we defer its freeing until
+
+    for (auto& [block, inserted_empty_nodes] : deferred_blocks) {
+      // Skip this block if it has no empty nodes, as we defer its freeing until
      // after graph capture. Also skip if the block was not allocated on the
      // current stream; such blocks will be freed when
      // free_safe_blocks_in_capture is attempted on that stream.
-      if (markers.empty() || block->stream != stream) {
+      if (inserted_empty_nodes.empty() || block->stream != stream) {
        continue;
      }

      bool is_reusable = true;
-      for (auto m : markers) {
-        if (!visited.count(m)) {
+
+      for (const auto& node : inserted_empty_nodes) {
+        if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) {
          is_reusable = false;
          break;
        }
@ -1905,11 +1919,11 @@ class DeviceCachingAllocator {
    if (!block->stream_uses.empty()) {
      if (C10_UNLIKELY(!captures_underway.empty())) {
        if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) {
-          // record_free_markers returns a vector of free markers,
+          // insert_free_marker returns a vector of free markers,
          // or an empty vector if any associated stream is not currently
          // capturing. The empty vector means that we will defer the free until
          // capture is finished.
-          deferred_blocks.emplace(block, record_free_markers(block));
+          deferred_blocks.emplace(block, insert_free_marker(block));
        } else {
          // If graph_capture_record_stream_reuse is not enabled, always defer
          // the free until capture is finished.
@ -2497,21 +2511,6 @@ class DeviceCachingAllocator {
  // Called by CUDAGraph::capture_end
  void endAllocateToPool(MempoolId_t mempool_id) {
    std::lock_guard<std::recursive_mutex> lock(mutex);
-
-    if (CUDAAllocatorConfig::graph_capture_record_stream_reuse() &&
-        !graph_reuse_context.empty()) {
-      auto capture_id = mempool_to_capture_id[mempool_id];
-      auto graph_context = graph_reuse_context[capture_id];
-      for (auto& [stream, _] : graph_context.visited) {
-        TORCH_INTERNAL_ASSERT(
-            stream_get_capture_info(stream).status ==
-                cudaStreamCaptureStatusNone,
-            "This stream should not be capturing when the capture is ended");
-      }
-      graph_reuse_context.erase(capture_id);
-      mempool_to_capture_id.erase(mempool_id);
-    }
-
    for (auto it = captures_underway.begin(); it != captures_underway.end();
         ++it) {
      if (it->first == mempool_id) {
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@ -1552,12 +1552,6 @@ endif()
 #
 # End ATen checks
 #
-
-# Install `fmtlib` header.
-# This was the default behavior before version 12.0.0.
-# Since PyTorch C API depends on it, make it available for projects that
-# depend on PyTorch.
-set(FMT_INSTALL ON)
 set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE)
 add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@ -138,7 +138,6 @@ function(caffe2_print_configuration_summary)
  message(STATUS "  USE_EIGEN_FOR_BLAS    : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
  message(STATUS "  USE_EIGEN_FOR_SPARSE  : ${USE_EIGEN_SPARSE}")
  message(STATUS "  USE_FBGEMM            : ${USE_FBGEMM}")
-  message(STATUS "  USE_FBGEMM_GENAI      : ${USE_FBGEMM_GENAI}")
  message(STATUS "  USE_KINETO            : ${USE_KINETO}")
  message(STATUS "  USE_GFLAGS            : ${USE_GFLAGS}")
  message(STATUS "  USE_GLOG              : ${USE_GLOG}")
--- a/cmake/prioritized_text.txt
+++ b/cmake/prioritized_text.txt
@ -153,7 +153,6 @@ _ZN3c104impl12PyObjectSlot10owns_pyobjEv
 _ZN3c104impl12PyObjectSlot19maybe_destroy_pyobjEv
 _ZN3c104impl12PyObjectSlotC1Ev
 _ZN3c104impl12PyObjectSlotD2Ev
-_ZN3c104impl19HermeticPyObjectTLS13get_tls_stateEv
 _ZN3c104impl20TorchDispatchModeTLS13any_modes_setEb
 _ZN3c104impl23ExcludeDispatchKeyGuardC1ENS_14DispatchKeySetE
 _ZN3c104impl23ExcludeDispatchKeyGuardD2Ev
--- a/docs/source/_static/img/aoti_debug_printer/after_launch.png
+++ b/docs/source/_static/img/aoti_debug_printer/after_launch.png
--- a/docs/source/_static/img/aoti_debug_printer/before_launch.png
+++ b/docs/source/_static/img/aoti_debug_printer/before_launch.png
--- a/docs/source/accelerator/amp.md
+++ b/docs/source/accelerator/amp.md
@ -1,72 +0,0 @@
-# Automatic Mixed Precision
-
-## Background
-
-Automatic Mixed Precision (AMP) enables the use of both single precision (32-bit) and half precision (16-bit) floating point types during training or inference.
-
-Key components include:
-
- [**Autocast**](https://docs.pytorch.org/docs/stable/amp.html#autocasting): Automatically casts operations to lower-precision (e.g., float16 or bfloat16) to improve performance while maintaining accuracy.
- [**Gradient Scaling**](https://docs.pytorch.org/docs/stable/amp.html#gradient-scaling): Dynamically scales gradients during backpropagation to prevent underflow when training with mixed precision.
-
-## Design
-
-### Casting Strategy
-
-The [`CastPolicy`](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/autocast_mode.h#L416-L438) is used to define type conversion rules. Each enum value represents a set of type conversion requirements for a group of operators, ensuring consistent handling of operations that prioritize either precision or performance.
-
-| Policy                   | Explanation                                                                          |
-| :---                     | :---                                                                                 |
-| **`lower_precision_fp`** | Cast all inputs to `lower_precision_fp` before execute the op.                       |
-| **`fp32`**               | Cast all inputs to `at::kFloat` before running the op.                               |
-| **`fp32_set_opt_dtype`** | Execution in `at::kFloat`, while respecting user-specified output dtype if provided. |
-| **`fp32_append_dtype`**  | Append at::kFloat to the args and redispatch to the type-aware overload              |
-| **`promote`**            | Promote all inputs to the “widest” dtype before execution.                           |
-
-### Operators Lists
-
-PyTorch defines a general list of operators for each of casting strategies mentioned above, as a reference for developers of new accelerators.
-
-| Policy                   | Operators List                                                                                    |
-| :---                     | :---                                                                                              |
-| **`lower_precision_fp`** | [List Link](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/autocast_mode.h#L819-L852) |
-| **`fp32`**               | [List Link](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/autocast_mode.h#L854-L912) |
-| **`fp32_set_opt_dtype`** | [List Link](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/autocast_mode.h#L914-L931) |
-| **`fp32_append_dtype`**  | [List Link](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/autocast_mode.h#L933-L958) |
-| **`promote`**            | [List Link](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/autocast_mode.h#L960-L971) |
-
-## Implementation
-
-### Python Integration
-
-Implement the `get_amp_supported_dtype` method to return the data types supported by the new accelerator in the AMP context.
-
-```{eval-rst}
-.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/openreg/amp/__init__.py
-    :language: python
-    :start-after: LITERALINCLUDE START: AMP GET_SUPPORTED_DTYPE
-    :end-before: LITERALINCLUDE END: AMP GET_SUPPORTED_DTYPE
-    :linenos:
-```
-
-### C++ Integration
-
-This section shows how AMP registers autocast kernels for the `AutocastPrivateUse1` dispatch key.
-
- Register a fallback that makes unhandled ops fall through to their normal implementations.
- Register specific aten kernels under `AutocastPrivateUse1` using the `KERNEL_PRIVATEUSEONE` helper macro, which maps an op to the desired precision implementation (with enum `at::autocast::CastPolicy`)
-
-```{eval-rst}
-.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/amp/autocast_mode.cpp
-    :language: c++
-    :start-after: LITERALINCLUDE START: AMP FALLTHROUTH
-    :end-before: LITERALINCLUDE END: AMP FALLTHROUTH
-    :linenos:
-
-.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/amp/autocast_mode.cpp
-    :language: c++
-    :start-after: LITERALINCLUDE START: AMP IMPL
-    :end-before: LITERALINCLUDE END: AMP IMPL
-    :emphasize-lines: 3,6,8-10
-    :linenos:
-```
--- a/docs/source/accelerator/index.md
+++ b/docs/source/accelerator/index.md
@ -44,7 +44,6 @@ Next, we will delve into each chapter of this guide. Each chapter focuses on a k

 autoload
 operators
-amp
 ```

 [OpenReg URL]: https://github.com/pytorch/pytorch/tree/main/test/cpp_extensions/open_registration_extension/torch_openreg "OpenReg URL"
--- a/docs/source/community/persons_of_interest.rst
+++ b/docs/source/community/persons_of_interest.rst
@ -339,16 +339,13 @@ XLA
 ~~~

 -  Jack Cao (`JackCaoG <https://github.com/JackCaoG>`__)
-  Han Qi (`qihqi <https://github.com/qihqi>`__)
-  Yifei Teng (`tengyifei <https://github.com/tengyifei>`__)
-  Siyuan Liu (`lsy323 <https://github.com/lsy323>`__)
+-  Daniel Sohn (`jysohn23 <https://github.com/jysohn23>`__)
+-  Zach Cain (`zcain117 <https://github.com/zcain117>`__)
 -  Brian Hirsh (`bdhirsh <https://github.com/bdhirsh>`__)
-  (emeritus) Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
+-  Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
 -  (emeritus) Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
 -  (emeritus) Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
 -  (emeritus) Alex Suhan (`asuhan <https://github.com/asuhan>`__)
-  (emeritus) Daniel Sohn (`jysohn23 <https://github.com/jysohn23>`__)
-  (emeritus) Zach Cain (`zcain117 <https://github.com/zcain117>`__)

 TorchServe
 ~~~~~~~~~~
--- a/docs/source/notes/broadcasting.rst
+++ b/docs/source/notes/broadcasting.rst
@ -13,6 +13,7 @@ General semantics
 -----------------
 Two tensors are "broadcastable" if the following rules hold:

+- Each tensor has at least one dimension.
 - When iterating over the dimension sizes, starting at the trailing dimension,
  the dimension sizes must either be equal, one of them is 1, or one of them
  does not exist.
@ -25,8 +26,7 @@ For Example::

    >>> x=torch.empty((0,))
    >>> y=torch.empty(2,2)
-    # x and y are not broadcastable, because the 0-sized dimension of x
-    # does not match the 2-sized dimension of y.
+    # x and y are not broadcastable, because x does not have at least 1 dimension

    # can line up trailing dimensions
    >>> x=torch.empty(5,3,4,1)
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@ -613,7 +613,8 @@ Available options:
  CUDA Graph capture by using the graph topology (instead of CUDA events) to determine
  when a freed block is safe to reuse. This can reduce peak memory during long captures that free
  and reallocate buffers across multiple streams, especially when the capture DAG frequently
-  reaches joined frontiers.
+  reaches joined frontiers. Note: Enabling this option can significantly increase the time spent
+  capturing the graph.

 .. note::

--- a/docs/source/torch.compiler_dynamic_shapes.md
+++ b/docs/source/torch.compiler_dynamic_shapes.md
@ -216,7 +216,7 @@ you never specialize.

 #### `mark_unbacked(tensor, dim)`

-The {func}`torch._dynamo.decorators.mark_unbacked` function marks a tensor dimension as unbacked. It is unlikely
+The {func}`torch._dynamo.mark_unbacked` function marks a tensor dimension as unbacked. It is unlikely
 to be the tool you need, but it could be useful if the specialization occurs inside
 a condition `guard_size_oblivious(x)`, and if using it removes the specialization.
 Ensure it fixes the specialization and does not introduce a data-dependent error
--- a/docs/source/torch.intermediate_debug_printer.md
+++ b/docs/source/torch.intermediate_debug_printer.md
@ -1,83 +0,0 @@
-```{eval-rst}
-:orphan:
-```
-
-# AOTInductor Intermediate Value Debug Printer
-
-This is a user manual on how to use AOT Inductor Intermediate Value Debug Printer tool which is a utility tool that can help pinpoint CUDA IMA kernels / numerical discrepancies when uses AOT Inductor to compile a PyTorch model.
-
-The main functionality of this tool is to automatically print out / or dump the value info of all intermediate tensor arguments before and after each kernel launch call in AOT Inductor.
-
-## How to use
-
-The debug printer can be configured via environment variable. The following flags are both supported to run with internal fbcode buck commands and OSS.
-
-All configurations are defined here: [torch/_inductor/config.py](https://github.com/pytorch/pytorch/blob/768361e67f0eb36491d7b763ef38d7c928ebefe6/torch/_inductor/config.py#L1493-L1505)
-
-
-```
-    # options for debug printing/saving for intermediate tensor values for aot inductor
-
-    0: disable debug dumping
-    1: enable saving intermediate tensor values
-    2: enable printing intermediate tensor values
-    3: enable printing kernel names only (useful for pinpointing troublesome kernels)
-```
-
-
-1. To enable **default** mode debug printing:
-
-    - Add flag `AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2` (PRINT_ONLY mode) for default printing all supported kernel tensor arg values.
-
-    - Add flag `AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT={kernel_name_1, kernel_name_2,...}` for selectively printing tensor values associated with the specified kernels. (suggest to do a run with generating full printing logs first)
-
-    Sample command:
-
-    ```
-    AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT="aoti_torch_cuda_addmm_out" AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2 TORCH_LOGS="+inductor, output_code" python test/inductor/test_aot_inductor.py -k test_addmm_cuda
-    ```
-
-
-2. To enable **pinpoint** the problematic kernel name only: (Especially useful in CUDA IMA debugging)
-
-   - Add flag `AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3` (PRINT_KERNEL_NAME_ONLY mode) no tensor numerical values will be dumped.
-
-   Sample command:
-
-   ```
-   AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3 TORCH_LOGS="+inductor, output_code" python test/inductor/test_aot_inductor.py -k test_addmm_cuda
-   ```
-
-3. To enable **save** the intermediate tensor values:
-
-    - Useful when you want to repro the error in a standalone kernel debugging repro. The saved intermediate tensor values can be used as debugging inputs to the problematic kernel.
-    - Set `AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1` (SAVE_ONLY mode)  for default saving all supported kernel tensor arg values to `.pt` in a tmp folder.
-    - Similarly, add `AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT={kernel_name_1, kernel_name_2,...}` for selectively saving tensor values associated with the specified kernels.
-
-    Sample command:
-    ```
-    AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT="triton_poi_fused_0" AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1 TORCH_LOGS="+inductor, output_code" python test/inductor/test_aot_inductor.py -k test_addmm_cuda
-    ```
-
-    The saved tensor values will be dumped in a format:  `<before/after_launch>_<kernel_name>_<arg_name>_<device>.pt`
-
-    The dumped `.pt` tensors can be further loaded and used like this:
-    ```
-        def _load_tensor(path):
-            return torch.load(path, weights_only=True)
-        tensor = _load_tensor("../tmp/aoti_torch/before_launch_aoti_torch_cuda_addmm_out_buf1_cuda:0.pt")
-
-        # Simply print tensor to view the full value
-        print(tensor)
-    ```
-
-## Example Outputs
-
-Before launch tensor stats:
-
-![Sample image 1](_static/img/aoti_debug_printer/before_launch.png)
-
-
-After launch tensor stats:
-
-![Sample image 2](_static/img/aoti_debug_printer/after_launch.png)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,7 @@
 requires = [
    # 70.1.0: min version for integrated bdist_wheel command from wheel package
    # 77.0.0: min version for SPDX expression support for project.license
-    "setuptools>=70.1.0",
+    "setuptools>=70.1.0,<80.0",
    "cmake>=3.27",
    "ninja",
    "numpy",
--- a/requirements-build.txt
+++ b/requirements-build.txt
@ -1,5 +1,5 @@
 # Build System requirements
-setuptools>=70.1.0
+setuptools>=70.1.0,<80.0  # setuptools develop deprecated on 80.0
 cmake>=3.27
 ninja
 numpy
--- a/Show More
+++ b/Show More