Compare commits

..

9 Commits

SHA1 Message Date
3411990fa0 Fix compile 2025-08-19 12:16:29 +00:00
fc32f3d5eb Update aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h 2025-08-18 14:43:19 +01:00
ef8f493676 Update aten/src/ATen/cpu/vec/sve/vec_float.h 2025-08-18 13:14:40 +01:00
92eaa3d3b8 Update aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h 2025-08-18 13:11:16 +01:00
e0340e599e [feat]: Add optimization for SVE exp function (Signed-off-by: Analle Abuammar <analle.abuammar@arm.com>, Co-authored-by: Fadi Arafeh <Fadi.Arafeh@arm.com>) 2025-08-15 15:47:00 +00:00
c3e4e4079e Fix tests 2025-08-15 15:15:48 +00:00
62f61292e3 add SVE dispatch 2025-08-15 15:15:36 +00:00
41cbceee59 Make size non-constexpr 2025-08-15 15:15:24 +00:00
46706e7c34 Vec length agnostic SVE Vectorized class POC 2025-08-15 15:15:05 +00:00
842 changed files with 52808 additions and 25868 deletions

View File

@ -438,7 +438,9 @@ def build_torchvision(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -493,7 +495,9 @@ def build_torchdata(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -549,7 +553,9 @@ def build_torchtext(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -607,7 +613,9 @@ def build_torchaudio(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

View File

@ -144,6 +144,16 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12
@ -154,6 +164,39 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
@ -176,7 +219,19 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
@ -190,9 +245,7 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-noble-rocm-alpha-py3)
ANACONDA_PYTHON_VERSION=3.12
@ -204,6 +257,7 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-2025.0-py3)

View File

@ -1 +1 @@
f7888497a1eb9e98d4c07537f0d0bcfe180d1363
11ec6354315768a85da41032535e3b7b99c5f706

View File

@ -66,9 +66,8 @@ function do_cpython_build {
ln -s pip3 ${prefix}/bin/pip
fi
# install setuptools since python 3.12 is required to use distutils
# packaging is needed to create symlink since wheel no longer provides needed information
${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")
${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
ln -sf ${prefix} /opt/python/${abi_tag}
}
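For reference, the packaging-based form of the abi tag computation above expands to this standalone snippet; the printed value depends on the interpreter that runs it (for example cp312-cp312, or cp313-cp313t on a free-threaded build):

    from packaging.tags import interpreter_name, interpreter_version
    import sysconfig

    # Same format string as the script: "<impl><ver>-<impl><ver>[t]".
    abi_tag = "{0}{1}-{0}{1}{2}".format(
        interpreter_name(),
        interpreter_version(),
        "t" if sysconfig.get_config_var("Py_GIL_DISABLED") else "",
    )
    print(abi_tag)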

View File

@ -68,8 +68,8 @@ function install_nvshmem {
# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
cp -a "libnvshmem/include/"* /usr/local/include/
cp -a "libnvshmem/lib/"* /usr/local/lib/
# cleanup
cd ..

View File

@ -15,37 +15,11 @@ function install_timm() {
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
}
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
chown -R jenkins torchbench
# Clean up
conda_run pip uninstall -y torch torchvision triton
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton torchao

View File

@ -103,5 +103,5 @@ fi
# It depends on torch and triton. We don't want to install
# triton and torch from production on Docker CI images
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
pip_install helion --no-deps
pip_install helion==0.0.10 --no-deps
fi

View File

@ -34,27 +34,18 @@ function install_ubuntu() {
# The xpu-smi packages
apt-get install -y flex bison xpu-smi
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
else # rolling driver
apt-get install -y \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
apt-get install -y intel-ocloc
fi
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel Support Packages
apt-get install -y ${XPU_PACKAGES}
@ -139,11 +130,11 @@ function install_sles() {
}
# Default use GPU driver rolling releases
XPU_DRIVER_VERSION=""
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
# Default use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
# Use GPU driver rolling releases
XPU_DRIVER_VERSION=""
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.0

View File

@ -63,12 +63,11 @@ lark==0.12.0
#Pinned versions: 0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py
#librosa depends on numba; disable it for s390x while numba is disabled too
#mkl #this breaks linux-bionic-rocm4.5-py3.7
#Description: Intel oneAPI Math Kernel Library
@ -111,15 +110,14 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
#numpy
#Description: Provides N-dimensional arrays and linear algebra
@ -309,7 +307,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:
z3-solver==4.15.1.0 ; platform_machine != "s390x"
z3-solver==4.15.1.0
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
@ -363,6 +361,7 @@ pwlf==2.2.1
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd
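The environment markers in this requirements file (the python_version and platform_machine guards) are evaluated by pip against the target platform. A small sketch of how such a marker resolves, using the packaging library and a hypothetical s390x override:

    from packaging.markers import Marker

    marker = Marker('python_version == "3.12" and platform_machine != "s390x"')
    # Evaluated against the running interpreter and platform by default...
    print(marker.evaluate())
    # ...or with explicit overrides, e.g. pretending to be an s390x host:
    print(marker.evaluate({"python_version": "3.12", "platform_machine": "s390x"}))  # False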

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@ -50,7 +50,7 @@ IPython==8.12.0
#Pinned versions: 8.12.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Description: This is used to generate PyTorch functorch and torch.compile docs
#Pinned versions: 0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs

View File

@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
ARG TRITON
ARG TRITON_CPU

View File

@ -138,11 +138,28 @@ fi
echo "Calling setup.py bdist at $(date)"
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
fi
echo "Finished setup.py bdist at $(date)"
# Build libtorch packages
@ -255,6 +272,10 @@ ls /tmp/$WHEELHOUSE_DIR
mkdir -p "/$WHEELHOUSE_DIR"
mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
fi
if [[ -n "$BUILD_PYTHONLESS" ]]; then
mkdir -p /$LIBTORCH_HOUSE_DIR
mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
@ -431,8 +452,16 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then
pushd $PYTORCH_ROOT/test
# Install the wheel for this Python version
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
fi
pip uninstall -y "$TORCH_PACKAGE_NAME"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
fi
pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
# Print info on the libraries installed in this wheel
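The split build path above runs setup.py bdist_wheel twice with complementary flags. Summarised as data (the flag values come from the script; the loop itself is only illustrative):

    split_build_passes = [
        # pass 1: produce the libtorch-only wheel
        {"BUILD_LIBTORCH_WHL": "1", "BUILD_PYTHON_ONLY": "0"},
        # pass 2: produce the python-only wheel with a fresh CMake configure
        {"BUILD_LIBTORCH_WHL": "0", "BUILD_PYTHON_ONLY": "1", "CMAKE_FRESH": "1"},
    ]
    for env_overrides in split_build_passes:
        flags = " ".join(f"{k}={v}" for k, v in env_overrides.items())
        print(f"{flags} python setup.py bdist_wheel")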

View File

@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
ROCBLAS_LIB_DST=lib/rocblas/library
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
# hipblaslt library files
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library

View File

@ -50,6 +50,9 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@ -173,7 +176,7 @@ fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
fi
@ -189,6 +192,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export USE_ASAN=1
export REL_WITH_DEB_INFO=1
export UBSAN_FLAGS="-fno-sanitize-recover=all"
unset USE_LLVM
fi
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
@ -261,13 +265,22 @@ else
WERROR=1 python setup.py clean
WERROR=1 python setup.py bdist_wheel
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
python3 tools/packaging/split_wheel.py bdist_wheel
else
WERROR=1 python setup.py bdist_wheel
fi
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
python setup.py bdist_wheel
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
fi
pip_install_whl "$(echo dist/*.whl)"
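The FlashAttention gate in this script enables a custom build step only when the CUDA arch list reaches 8.0. The intended "any entry is at least 8.0" logic, written out in Python with an assumed TORCH_CUDA_ARCH_LIST value and a common "+PTX" suffix stripped for illustration:

    arch_list = "7.5 8.0+PTX 9.0"  # hypothetical TORCH_CUDA_ARCH_LIST
    needs_flash_attention_cap = any(
        float(arch.replace("+PTX", "")) >= 8.0 for arch in arch_list.split()
    )
    print(needs_flash_attention_cap)  # True: 8.0 and 9.0 qualify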

View File

@ -229,6 +229,7 @@ function install_torchrec_and_fbgemm() {
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
pushd /tmp
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
@ -244,7 +245,7 @@ function install_torchrec_and_fbgemm() {
if [ "${found_whl}" == "0" ]; then
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
git checkout "${fbgemm_commit}"
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
@ -263,6 +264,7 @@ function install_torchrec_and_fbgemm() {
done
rm -rf fbgemm
popd
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
@ -281,6 +283,30 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)

View File

@ -157,29 +157,6 @@ test_jit_hooks() {
assert_git_not_dirty
}
# Shellcheck doesn't like it when you pass no arguments to a function
# that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2120
checkout_install_torchbench() {
local commit
commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {
git clone --recursive https://github.com/pytorch/vision torchvision
git clone --recursive https://github.com/pytorch/audio torchaudio
@ -202,6 +179,8 @@ torchbench_setup_macos() {
USE_OPENMP=0 python setup.py develop
popd
# Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2119,SC2120
checkout_install_torchbench
}

View File

@ -627,8 +627,6 @@ test_perf_for_dashboard() {
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *b200* ]]; then
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@ -803,16 +801,6 @@ test_dynamo_benchmark() {
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
if [[ "${suite}" == "huggingface" ]]; then
export TORCHBENCH_ONLY_MODELS="DistillGPT2"
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@ -1051,10 +1039,20 @@ test_libtorch_api() {
mkdir -p $TEST_REPORTS_DIR
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
# On s390x, pytorch is built without llvm.
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi
# quantization is not fully supported on s390x yet
@ -1674,11 +1672,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
PYTHONPATH=/torchbench test_cachebench
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
PYTHONPATH=/torchbench test_verify_cachebench
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
@ -1687,22 +1687,28 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi

View File

@ -192,6 +192,9 @@ retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1
if [[ -n "$CROSS_COMPILE_ARM64" ]]; then
export CMAKE_OSX_ARCHITECTURES=arm64
fi
export USE_MKLDNN=OFF
export USE_QNNPACK=OFF
export BUILD_TEST=OFF
@ -199,7 +202,16 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"
python setup.py bdist_wheel -d "$whl_tmp_dir"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
python setup.py bdist_wheel -d "$whl_tmp_dir"
fi
echo "Finished setup.py bdist_wheel at $(date)"

View File

@ -65,8 +65,16 @@ fi
if [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
# todo: after folder is populated use the pypi_pkg channel instead
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions

View File

@ -134,6 +134,7 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
export DESIRED_CUDA="$DESIRED_CUDA"
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
if [[ "${OSTYPE}" == "msys" ]]; then
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then

View File

@ -23,6 +23,10 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
AWS_S3_CP="aws s3 cp"
fi
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"

View File

@ -24,6 +24,7 @@ runs:
-e PYTORCH_FINAL_PACKAGE_DIR \
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -1 +1 @@
e500f0cf88bc57ffd8b0029033da305eef24ae25
bf305f538005f2e900f8850ed57146024a8bc559

View File

@ -1 +1 @@
35afe1b30b154114dc2ee8329e12f8cf3fe9f576
ca9e2be3ed6320b51f52f536595cd24e254f8bb2

View File

@ -1 +1 @@
095faec1e7b6cc47220181e74ae9cde2605f9b00
29ae4c76c026185f417a25e841d2cd5e65f087a3

View File

@ -488,10 +488,6 @@
- torch/_dynamo/**
- torch/csrc/dynamo/**
- test/dynamo/**
- test/dynamo_expected_failures/**
- test/dynamo_skips/**
- test/inductor_expected_failures/**
- test/inductor_skips/**
approved_by:
- guilhermeleobas
mandatory_checks_name:

View File

@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
"cpu": "libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -273,6 +273,7 @@ def generate_wheels_matrix(
os: str,
arches: Optional[list[str]] = None,
python_versions: Optional[list[str]] = None,
use_split_build: bool = False,
) -> list[dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
@ -314,11 +315,15 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os != "linux" and (
python_version == "3.14" or python_version == "3.14t"
if use_split_build and (
arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"
):
continue
raise RuntimeError(
"Split build is only supported on linux with cuda 12* and cpu.\n"
f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
"Please modify the matrix generation to exclude this combination."
)
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
@ -334,6 +339,7 @@ def generate_wheels_matrix(
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": desired_cuda,
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],
@ -366,6 +372,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[0],
@ -388,6 +395,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],
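The guard added to generate_wheels_matrix restricts split builds to Linux CPU and CUDA 12.x configurations and raises for anything else. A condensed, standalone sketch of just that check, with the surrounding matrix loop omitted:

    def check_split_build(arch_version: str, os: str, use_split_build: bool) -> None:
        # Mirrors the guard in generate_wheels_matrix.
        if use_split_build and (
            arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"
        ):
            raise RuntimeError(
                "Split build is only supported on linux with cuda 12* and cpu.\n"
                f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
                "Please modify the matrix generation to exclude this combination."
            )

    check_split_build("12.8", "linux", use_split_build=True)   # passes silently
    # check_split_build("11.8", "linux", use_split_build=True) # would raise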

View File

@ -59,7 +59,9 @@ class BinaryBuildWorkflow:
is_scheduled: str = ""
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-14-xlarge"
use_split_build: bool = False
# Mainly used for libtorch builds
build_variant: str = ""
@ -70,6 +72,9 @@ class BinaryBuildWorkflow:
for item in [self.os, "binary", self.package_type, self.build_variant]
if item != ""
)
if self.use_split_build:
# added to distinguish concurrency groups
self.build_environment += "-split"
def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file_path = (
@ -112,6 +117,21 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
isolated_workflow=True,
),
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# use_split_build=True,
# arches=["11.8", "12.1", "12.4", "cpu"],
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
# isolated_workflow=True,
# ),
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -155,11 +175,27 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["12.8"],
python_versions=["3.12"],
arches=["12.6", "12.8", "12.9"],
python_versions=["3.9"],
),
branches="main",
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# arches=["11.8", "12.1", "12.4"],
# python_versions=["3.9"],
# use_split_build=True,
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_PERIODIC},
# ),
# branches="main",
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -302,6 +338,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
generate_binary_build_matrix.RELEASE,
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -314,6 +351,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
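The "-split" suffix appended in __post_init__ exists only to give split and non-split binary builds distinct build environments (and hence concurrency groups). A reduced sketch of that logic with the dataclass trimmed to the fields involved:

    from dataclasses import dataclass, field

    @dataclass
    class BinaryBuildWorkflow:
        os: str
        package_type: str
        build_variant: str = ""
        use_split_build: bool = False
        build_environment: str = field(init=False, default="")

        def __post_init__(self) -> None:
            self.build_environment = "-".join(
                item
                for item in [self.os, "binary", self.package_type, self.build_variant]
                if item != ""
            )
            if self.use_split_build:
                # added to distinguish concurrency groups
                self.build_environment += "-split"

    w = BinaryBuildWorkflow(os="linux", package_type="manywheel", use_split_build=True)
    print(w.build_environment)  # linux-binary-manywheel-split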

View File

@ -262,12 +262,7 @@ def is_exception_branch(branch: str) -> bool:
"""
Branches that get opted out of experiments by default, until they're explicitly enabled.
"""
return branch.split("/", maxsplit=1)[0] in {
"main",
"nightly",
"release",
"landchecks",
}
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def load_yaml(yaml_text: str) -> Any:
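Both forms of is_exception_branch return the first path component of the branch name, so the behaviour is unchanged; maxsplit=1 merely avoids splitting the remainder. With an assumed branch name:

    branch = "release/2.8"
    print(branch.split("/", maxsplit=1)[0])  # release
    print(branch.split("/")[0])              # release
    print(branch.split("/")[0] in {"main", "nightly", "release", "landchecks"})  # True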

View File

@ -47,6 +47,9 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SKIP_ALL_TESTS: 0
{%- if cross_compile_arm64 %}
CROSS_COMPILE_ARM64: 1
{% endif %}
!{{ common.concurrency(build_environment) }}
jobs:

View File

@ -25,6 +25,11 @@
DOCKER_IMAGE: !{{ config["container_image"] }}
DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }}
{%- endif %}
{%- if config["package_type"] == "manywheel" %}
{%- if config.use_split_build is defined %}
use_split_build: !{{ config["use_split_build"] }}
{%- endif %}
{%- endif %}
{%- if config["package_type"] == "libtorch" %}
{%- if config["libtorch_config"] %}
LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}

View File

@ -26,6 +26,13 @@ on:
default: 240
type: number
description: timeout for the job
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
ALPINE_IMAGE:
required: false
type: string
@ -110,6 +117,7 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -134,6 +142,7 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.use_split_build }}"
} >> "${GITHUB_ENV} }}"
- name: List the env
@ -252,6 +261,7 @@ jobs:
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -64,6 +64,13 @@ on:
required: true
type: string
description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -97,6 +104,7 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -121,6 +129,7 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
} >> "${GITHUB_ENV} }}"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"

View File

@ -51,6 +51,13 @@ on:
required: false
type: string
description: Desired python version
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -79,6 +86,7 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -306,6 +306,7 @@ jobs:
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e USE_SPLIT_BUILD \
-e BUILD_ADDITIONAL_PACKAGES \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \

View File

@ -96,7 +96,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@ -109,7 +109,7 @@ jobs:
no-sudo: true
- name: Setup Python
if: contains(matrix.runner, 'b200')
if: matrix.runner == 'B200'
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
@ -117,7 +117,7 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@ -128,7 +128,7 @@ jobs:
aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@ -166,17 +166,17 @@ jobs:
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
@ -277,8 +277,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@ -403,7 +403,7 @@ jobs:
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ contains(matrix.runner, 'b200') }}
if: ${{ matrix.runner == 'B200' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
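The runner conditions in this workflow toggle between a substring test, contains(matrix.runner, 'b200') (case-insensitive in GitHub's expression syntax), and an exact comparison, matrix.runner == 'B200'. The difference in Python terms, with a hypothetical runner label:

    runner = "linux.dgx.b200"  # hypothetical runner label from the test matrix

    # contains(matrix.runner, 'b200'): matches any label containing the substring
    print("b200" in runner.lower())  # True

    # matrix.runner == 'B200': exact label comparison only
    print(runner == "B200")          # False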

View File

@ -34,8 +34,7 @@ jobs:
contents: read
pull-requests: write
name: Check labels
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
if: github.repository_owner == 'pytorch'
runs-on: linux.24_04.4x
steps:
- name: Checkout PyTorch

View File

@ -7,8 +7,7 @@ on:
jobs:
ghstack-mergeability-check:
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
if: github.repository_owner == 'pytorch'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -51,17 +51,21 @@ jobs:
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
@ -72,8 +76,7 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu
]
include:

View File

@ -60,6 +60,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -83,6 +84,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -106,6 +108,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
secrets:
@ -126,6 +129,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -152,6 +156,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64-12_9
secrets:
@ -171,6 +176,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -194,6 +200,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -217,6 +224,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
secrets:
@ -237,6 +245,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -263,6 +272,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-12_9
secrets:
@ -282,6 +292,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -305,6 +316,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -328,6 +340,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
secrets:
@ -348,6 +361,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -374,6 +388,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-12_9
secrets:
@ -393,6 +408,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -416,6 +432,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -439,6 +456,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
secrets:
@ -459,6 +477,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -485,6 +504,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
@ -504,6 +524,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -527,6 +548,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -550,6 +572,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
secrets:
@ -570,6 +593,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -596,6 +620,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
@ -615,6 +640,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -638,6 +664,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -661,6 +688,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
secrets:
@ -681,6 +709,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -707,6 +736,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:

View File

@ -42,7 +42,54 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_12-cuda12_8-build:
manywheel-py3_9-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_6-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -56,17 +103,18 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
manywheel-py3_9-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_8-build
- manywheel-py3_9-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -79,8 +127,56 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_8
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner

File diff suppressed because it is too large


@ -58,6 +58,7 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_4
@ -82,6 +83,7 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm


@ -60,6 +60,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -83,6 +84,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -105,6 +107,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
@ -124,6 +127,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -147,6 +151,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -169,6 +174,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
secrets:
@ -188,6 +194,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -211,6 +218,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -233,6 +241,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
secrets:
@ -252,6 +261,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -275,6 +285,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -297,6 +308,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
secrets:
@ -316,6 +328,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -339,6 +352,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -361,6 +375,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
secrets:


@ -1,154 +0,0 @@
name: inductor-perf-b200
on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit


@ -85,7 +85,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },


@ -81,21 +81,21 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit


@ -75,11 +75,10 @@ jobs:
repo-owner: pytorch
branch: main
pin-folder: .github/ci_commit_pins
# executorch jobs are disabled since it needs some manual work for the hash update
# - repo-name: executorch
# repo-owner: pytorch
# branch: main
# pin-folder: .ci/docker/ci_commit_pins
- repo-name: executorch
repo-owner: pytorch
branch: main
pin-folder: .ci/docker/ci_commit_pins
- repo-name: triton
repo-owner: triton-lang
branch: main


@ -51,6 +51,37 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_4-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-build:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml


@ -254,6 +254,36 @@ jobs:
timeout-minutes: 600
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed
- target-determination
with:
timeout-minutes: 360
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build:
name: linux-jammy-cuda12.8-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
@ -262,18 +292,13 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5 8.9'
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@ -304,6 +329,30 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_10-gcc11-bazel-test:
name: linux-jammy-cpu-py3.10-gcc11-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
@ -353,8 +402,38 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
@ -379,6 +458,31 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml


@ -10,10 +10,6 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
if: github.repository_owner == 'pytorch'


@ -205,7 +205,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },


@ -12,9 +12,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
permissions: read-all
jobs:
# There must be at least one job here to satisfy GitHub action workflow syntax
@ -53,27 +51,3 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit


@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}

.gitignore (vendored): 3 changes

@ -146,9 +146,6 @@ merge_record.json
torchgen/packaged/*
!torchgen/packaged/README.md
# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py.
torch/_rocm_init.py
# IPython notebook checkpoints
.ipynb_checkpoints


@ -1452,6 +1452,8 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'--no-black-binary',
'black==23.12.1',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.2', # sync with RUFF

.pre-commit-config.yaml (new file): 12 lines added

@ -0,0 +1,12 @@
repos:
- repo: local
hooks:
- id: lintrunner
name: Run Lintrunner in an isolated venv before every push. The first run may be slow...
entry: python scripts/run_lintrunner.py # wrapper below
language: python # precommit manages venv for the wrapper
additional_dependencies: [] # wrapper handles lintrunner install
always_run: true
stages: [pre-push] # fire only on prepush
pass_filenames: false # Lintrunner gets no perfile args
verbose: true # stream output as it is produced...allegedly anyways


@ -1,17 +1 @@
- This is the only AGENTS.md, there are no recursive AGENTS.md
- When you are working on a bug, first create a standalone file that
reproduces the bug and verify it fails in the expected way. Use this to
test if your changes work. Once the change is passing, find an appropriate
test file to add the test to and make sure to follow local conventions on
the test file.
- If you are running the real test suite, DO NOT run the entire test suite.
Instead run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir'
- Do NOT run setup.py, you do not have a working build environment
- Do NOT run pre-commit, it is not setup
- To run lint, run 'lintrunner -a' (which will autoapply changes)
- Do NOT attempt to install dependencies, you do not have Internet access
- When you are ready to make a PR, do exactly these steps:
- git stash -u
- git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch
- git stash pop
- Resolve conflicts if necessary


@ -239,9 +239,7 @@ option(USE_XPU "Use XPU" ON)
cmake_dependent_option(
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
"USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX OR WIN32" OFF)
cmake_dependent_option(USE_ROCM_CK_GEMM "Use ROCm Composable Kernel for GEMMs" ON "USE_ROCM;NOT WIN32" OFF)
option(USE_ROCM_CK_SDPA "Use ROCm Composable Kernel for SDPA" OFF)
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF
@ -253,6 +251,7 @@ cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF)
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
@ -261,13 +260,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
OFF)
@ -325,6 +322,7 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN})
cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN"
OFF)
option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(
USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
@ -836,11 +834,10 @@ include(ExternalProject)
# ---[ Dependencies ---[ FBGEMM doesn't work on x86 32bit and
# CMAKE_SYSTEM_PROCESSOR thinks its 64bit
if(USE_FBGEMM AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
message(WARNING
"x64 operating system is required for FBGEMM. "
"Not compiling with FBGEMM. "
"Turn this warning off by USE_FBGEMM=OFF.")
if(USE_FBGEMM
AND((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL
4)
OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86"))
set(USE_FBGEMM OFF)
endif()


@ -14,6 +14,7 @@
/torch/csrc/autograd/ @albanD @soulitzer
/torch/autograd/ @albanD @soulitzer
/tools/autograd/ @albanD @soulitzer
/torch/header_only_apis.txt @janeyx99
/torch/nn/ @albanD @jbschlosser @mikaylagawarecki
/torch/optim/ @albanD @janeyx99
/test/test_public_bindings.py @albanD
@ -164,7 +165,6 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
# torch.export
/torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
/torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
/torch/_export/serde/schema.py @SherlockNoMad @zhxchen17
# Dynamic Shapes
/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka
@ -196,8 +196,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed
/torch/utils/_cxx_pytree.py @XuehaiPan
/torch/utils/pytree/ @XuehaiPan
/torch/_dynamo/polyfills/pytree.py @XuehaiPan
# Relating to libtorch ABI
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99


@ -243,7 +243,7 @@ git submodule update --init --recursive
```bash
conda install cmake ninja
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below
pip install -r requirements.txt
```
@ -276,7 +276,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv
conda install -c conda-forge libuv=1.39
```
#### Install PyTorch
@ -560,7 +560,7 @@ To learn more about making a contribution to Pytorch, please see our [Contributi
PyTorch is a community-driven project with several skillful engineers and researchers contributing to it.
PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). <!-- codespell:ignore -->
Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch.


@ -119,8 +119,6 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp")
file(GLOB_RECURSE native_mps_mm "native/mps/*.mm")
file(GLOB_RECURSE native_mps_metal "native/mps/*.metal")
file(GLOB_RECURSE native_mps_h "native/mps/*.h")
file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm")
file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal")
file(GLOB native_sparse_cpp "native/sparse/*.cpp")
file(GLOB native_quantized_cpp
@ -180,27 +178,26 @@ file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_a
file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip")
# if USE_FLASH_ATTENTION is set, ensure CK instances get generated
if(USE_FLASH_ATTENTION)
if("$ENV{USE_CK_FLASH_ATTENTION}" STREQUAL "1")
message(STATUS "USE_CK_FLASH_ATTENTION is being deprecated. Please use USE_ROCM_CK_SDPA instead")
caffe2_update_option(USE_ROCM_CK_SDPA ON)
endif()
if(USE_ROCM_CK_SDPA)
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS)
if(NUM_ARCHS GREATER 1)
message(WARNING "Building CK for multiple archs can increase build time considerably!
Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for")
if(DEFINED ENV{USE_CK_FLASH_ATTENTION})
set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION})
if(USE_CK_FLASH_ATTENTION STREQUAL "1")
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS)
if(NUM_ARCHS GREATER 1)
message(WARNING "Building CK for multiple archs can increase build time considerably!
Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for")
endif()
endif()
message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled")
message(STATUS "Generating CK kernel instances...")
add_subdirectory(native/transformers/hip/flash_attn/ck)
file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
# FAv3 Generation
add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3)
file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip})
endif()
endif()
message(STATUS "USE_ROCM_CK_SDPA is set; building PyTorch with CK SDPA enabled")
message(STATUS "Generating CK kernel instances...")
add_subdirectory(native/transformers/hip/flash_attn/ck)
file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
# FAv3 Generation
add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3)
file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip})
endif()
file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip")
file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip")
@ -419,42 +416,39 @@ if(USE_CUDA)
endif()
if(USE_ROCM)
if((USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) OR USE_ROCM_CK_GEMM)
# NOTE: The PyTorch build does not actually add_subdirectory
# third_party/composable_kernel or use it as a CMake library. What is used
# is header only, so this should be ok, except that the CMake build generates
# a ck/config.h. We just do that part here. Without this, the ck.h from the
# ROCM SDK may get accidentally used instead.
function(_pytorch_rocm_generate_ck_conf)
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
set(CK_ENABLE_FP8 "ON")
set(CK_ENABLE_BF8 "ON")
set(CK_USE_XDL "ON")
set(CK_USE_WMMA "ON")
configure_file(
"${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
)
endfunction()
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
endif()
# NOTE: The PyTorch build does not actually add_subdirectory
# third_party/composable_kernel or use it as a CMake library. What is used
# is header only, so this should be ok, except that the CMake build generates
# a ck/config.h. We just do that part here. Without this, the ck.h from the
# ROCM SDK may get accidentally used instead.
function(_pytorch_rocm_generate_ck_conf)
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
set(CK_ENABLE_FP8 "ON")
set(CK_ENABLE_BF8 "ON")
set(CK_USE_XDL "ON")
set(CK_USE_WMMA "ON")
configure_file(
"${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
)
endfunction()
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
# Next two lines are needed because TunableOp uses third-party/fmt
list(APPEND ATen_HIP_INCLUDE $<TARGET_PROPERTY:fmt::fmt-header-only,INTERFACE_INCLUDE_DIRECTORIES>)
list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only)
if(USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck)
endif()
if(USE_FLASH_ATTENTION)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck)
endif()
list(APPEND ATen_HIP_SRCS
${ATen_HIP_SRCS}
${hip_hip}
@ -464,17 +458,12 @@ if(USE_ROCM)
${native_quantized_hip_hip}
${native_transformers_hip_hip} ${native_transformers_src_hip_hip}
)
if(NOT USE_ROCM_CK_GEMM)
if(WIN32) # Windows doesn't support Composable Kernels
file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip")
file(GLOB native_hip_ck "native/hip/ck*.hip")
exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
${native_hip_bgemm} ${native_hip_ck})
endif()
if(WIN32) # Windows doesn't support Composable Kernels and Triton
exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
${native_transformers_hip_hip} ${native_transformers_hip_cpp})
endif()
# TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources)
list(APPEND all_hip_cpp
${native_nested_hip_cpp}
@ -709,25 +698,29 @@ endif()
if(USE_MPS)
include(../../../cmake/Metal.cmake)
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm})
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h})
if(CAN_COMPILE_METAL)
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
list(APPEND AIR_BASIC ${TGT_BASIC})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
endforeach()
air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
add_custom_command(
COMMAND echo "// $$(date)" > metallib_dummy.cpp
DEPENDS kernels_basic.metallib
DEPENDS kernels_basic.metallib kernels_bfloat.metallib
OUTPUT metallib_dummy.cpp
COMMENT "Updating metallibs timestamp")
add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
else()
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h")
metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME})


@ -480,9 +480,6 @@ at::BlasBackend Context::blasPreferredBackend() {
// call site for blasPreferredBackend(), we set it to an actual value.
if (blas_preferred_backend == at::BlasBackend::Default) {
blas_preferred_backend = at::BlasBackend::Cublas;
// This logic sits in the getter because it needs to validate
// values set via env vars such as TORCH_BLAS_PREFER_CUBLASLT
// which initialize the backend without calling the setter
#ifdef USE_ROCM
// AMD Instinct targets prefer hipblaslt
static const bool hipblaslt_preferred = []() {
@ -512,10 +509,6 @@ at::BlasBackend Context::blasPreferredBackend() {
// hipblaslt support for all archs is not as complete as hipblas
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
static const bool hipblaslt_unsupported = []() {
if(!hasCuBLASLt())
{
return true;
}
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
@ -541,24 +534,6 @@ at::BlasBackend Context::blasPreferredBackend() {
return blas_preferred_backend;
}
bool Context::ckSupported() {
#ifdef USE_ROCM
static const std::vector<std::string> supported_archs = {
"gfx90a", "gfx942", "gfx950"
};
for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) {
if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) {
TORCH_WARN_ONCE(
"Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
return false;
}
}
return true;
#else
return false;
#endif
}
void Context::setBlasPreferredBackend(at::BlasBackend b) {
#ifdef _MSC_VER
TORCH_WARN_ONCE(
@ -568,14 +543,8 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
#else
TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(),
"Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt.");
#ifdef USE_ROCM
static const bool ckSupportedFlag = ckSupported();
static const bool hasCKGEMMFlag = hasCKGEMM();
TORCH_CHECK((b != at::BlasBackend::Ck) || (ckSupportedFlag && hasCKGEMMFlag),
"Cannot set preferred blas backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK GEMM support: ", hasCKGEMMFlag);
#endif
TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(),
"Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm.");
if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) {
TORCH_WARN_ONCE(
"torch.backends.cuda.preferred_blas_library is an experimental feature. "
@ -587,40 +556,35 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
#endif
}
at::ROCmFABackend Context::getROCmFAPreferredBackend() {
#ifdef USE_ROCM
// Set potential "Default" value so we don't have to interpret at call sites.
// We use aotriton backend as the default, for now.
if(rocm_fa_preferred_backend == at::ROCmFABackend::Default) {
rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton;
} else if (rocm_fa_preferred_backend == at::ROCmFABackend::Ck) {
// This logic sits in the getter because it needs to validate
// values set via env vars such as TORCH_ROCM_FA_PREFER_CK
// which initialize the backend without calling the setter
// Perform validity checking
static const bool hasCKSDPAFlag = hasCKSDPA();
static const bool ckSupportedFlag = ckSupported();
if(!(hasCKSDPAFlag && ckSupportedFlag)){
TORCH_WARN_ONCE(
"Cannot set preferred SDPA backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK SDPA support: ", hasCKSDPAFlag);
rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton;
}
}
#endif
at::ROCmFABackend Context::getROCmFAPreferredBackend() const {
return rocm_fa_preferred_backend;
}
void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
// TODO: add plumbing for hasCK for validity checking
TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(),
"Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm.");
#ifdef USE_ROCM
static const bool hasCKSDPAFlag = hasCKSDPA();
static const bool ckSupportedFlag = ckSupported();
TORCH_CHECK((b != at::ROCmFABackend::Ck) || (hasCKSDPAFlag && ckSupportedFlag),
"Cannot set preferred SDPA backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK SDPA support: ", hasCKSDPAFlag);
if(b == at::ROCmFABackend::Ck) {
static const bool ck_unsupported = []() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942"
};
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
if (!detail::getCUDAHooks().isGPUArch(archs, index)) {
TORCH_WARN_ONCE(
"Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
return true;
}
}
return false;
}();
if(!ck_unsupported) rocm_fa_preferred_backend = b;
}
else {
rocm_fa_preferred_backend = b;
}
#endif
rocm_fa_preferred_backend = b;
}
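A minimal sketch of how these Context knobs are driven from C++ (illustrative only, not part of this diff; on builds without cuBLASLt or ROCm the setters warn or throw, per the checks above):

```cpp
// Illustrative sketch only: driving the preferred-backend knobs touched above.
// On builds without cuBLASLt or ROCm support the setters may throw or warn,
// as the TORCH_CHECK/TORCH_WARN_ONCE calls in this diff show.
#include <ATen/Context.h>

void select_backends() {
  auto& ctx = at::globalContext();
  ctx.setBlasPreferredBackend(at::BlasBackend::Cublaslt);      // GEMM via cuBLASLt/hipBLASLt
  ctx.setROCmFAPreferredBackend(at::ROCmFABackend::AOTriton);  // flash attention backend on ROCm
  // The getters validate env-var overrides (e.g. TORCH_BLAS_PREFER_CUBLASLT)
  // before returning, which is why the validation logic sits in the getter.
  auto blas = ctx.blasPreferredBackend();
  auto fa = ctx.getROCmFAPreferredBackend();
  (void)blas;
  (void)fa;
}
```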


@ -132,7 +132,6 @@ class TORCH_API Context {
static bool hasKleidiAI();
static bool hasLAPACK();
static bool hasMKLDNN();
static bool ckSupported();
static bool hasMAGMA() {
return detail::getCUDAHooks().hasMAGMA();
}
@ -163,12 +162,6 @@ class TORCH_API Context {
static bool hasROCM() {
return detail::getCUDAHooks().hasROCM();
}
static bool hasCKSDPA() {
return detail::getCUDAHooks().hasCKSDPA();
}
static bool hasCKGEMM() {
return detail::getCUDAHooks().hasCKGEMM();
}
static bool hasHIP() {
return detail::getHIPHooks().hasHIP();
}
@ -259,7 +252,7 @@ class TORCH_API Context {
at::BlasBackend blasPreferredBackend();
void setBlasPreferredBackend(at::BlasBackend);
at::ROCmFABackend getROCmFAPreferredBackend();
at::ROCmFABackend getROCmFAPreferredBackend() const;
void setROCmFAPreferredBackend(at::ROCmFABackend);
// Note [Enabling Deterministic Operations]


@ -1,6 +1,5 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
TORCH_API inline void emptyCache() {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->emptyCache();
}
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}
} // namespace at::accelerator
namespace at {


@ -31,9 +31,7 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
return at::globalContext().getPinnedMemoryAllocator(opt_device_type);
} else {
TORCH_CHECK(
false,
"pin_memory=True requires a CUDA or other accelerator backend; "
"no pinned memory allocator is available on this system.")
false, "Need to provide pin_memory allocator to use pin memory.")
}
}
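For context, the call path that reaches this check is a host allocation with pinned memory requested; a minimal sketch (illustrative only, not part of this diff):

```cpp
// Illustrative only: requesting pinned host memory routes through
// GetCPUAllocatorMaybePinned above; without CUDA or another accelerator
// backend, the TORCH_CHECK in this hunk fires.
#include <ATen/ATen.h>

at::Tensor make_pinned_buffer() {
  return at::empty(
      {1024}, at::TensorOptions().dtype(at::kFloat).pinned_memory(true));
}
```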


@ -10,6 +10,10 @@
#include <ideep.hpp>
#endif
#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif
#include <caffe2/core/common.h>
#include <ATen/native/DispatchStub.h>
@ -103,7 +107,9 @@ std::string get_cpu_capability() {
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
case native::CPUCapability::ZVECTOR:
return "Z VECTOR";
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
case native::CPUCapability::SVE:
return "SVE";
case native::CPUCapability::SVE256:
return "SVE256";
#else
@ -118,6 +124,12 @@ std::string get_cpu_capability() {
return "";
}
int get_sve_len() {
// It is possible that we override the cpu_capability with
// environment variable
return cpuinfo_get_max_arm_sve_length();
}
static std::string used_cpu_capability() {
// It is possible that we override the cpu_capability with
// environment variable

View File

@ -15,4 +15,6 @@ TORCH_API std::string get_cxx_flags();
TORCH_API std::string get_cpu_capability();
TORCH_API int get_sve_len();
} // namespace at
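A small usage sketch for the new query (illustrative only; assumes the declarations above live in ATen/Version.h, as the surrounding hunk suggests):

```cpp
// Illustrative sketch: printing the detected CPU capability and the maximum
// SVE vector length reported by cpuinfo. The include path is an assumption.
#include <ATen/Version.h>
#include <iostream>

int main() {
  std::cout << "cpu capability: " << at::get_cpu_capability() << "\n";
  // Expected to report 0 on machines without SVE.
  std::cout << "sve length: " << at::get_sve_len() << "\n";
  return 0;
}
```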


@ -239,7 +239,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp)
// fp32
KERNEL_MPS(conv_transpose3d, input, fp32)
KERNEL_MPS(acos, fp32)
KERNEL_MPS(asin, fp32)
KERNEL_MPS(cosh, fp32)


@ -97,8 +97,6 @@ c10::TypePtr IValue::TagType<c10::Type>::get(const IValue& v) {
return ComplexType::get();
case Tag::Int:
return IntType::get();
case Tag::UInt:
return IntType::get();
case Tag::SymInt:
return c10::SymIntType::get();
case Tag::SymFloat:
@ -322,8 +320,6 @@ IValue IValue::equals(const IValue& rhs) const {
return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble();
case Tag::Int:
return rhs.isInt() && lhs.toInt() == rhs.toInt();
case Tag::UInt:
return rhs.isUnsigned() && lhs.toUInt() == rhs.toUInt();
case Tag::SymInt:
return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt();
case Tag::SymFloat:
@ -383,8 +379,6 @@ size_t IValue::hash(const IValue& v) {
case Tag::Int:
return c10::get_hash(v.payload.u.as_int);
// NB: these are technically strict aliasing violations
case Tag::UInt:
return c10::get_hash(v.payload.u.as_int);
case Tag::SymInt:
return c10::get_hash(v.payload.u.as_int);
case Tag::SymFloat:
@ -812,8 +806,6 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
return printComplex(out, v);
} case IValue::Tag::Int:
return out << v.toInt();
case IValue::Tag::UInt:
return out << v.toUInt();
case IValue::Tag::SymInt:
return out << v.toSymInt();
case IValue::Tag::SymFloat:

View File

@ -12,7 +12,6 @@
#include <c10/macros/Export.h>
#include <c10/util/MaybeOwned.h>
#include <c10/util/intrusive_ptr.h>
#include <limits>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
@ -161,7 +160,6 @@ struct Capsule {
_(Double) \
_(ComplexDouble) \
_(Int) \
_(UInt) \
_(SymInt) \
_(SymFloat) \
_(SymBool) \
@ -655,29 +653,6 @@ struct TORCH_API IValue final {
}
}
// Unsigned
IValue(uint64_t u) : tag( u <= std::numeric_limits<int64_t>::max() ? Tag::Int : Tag::UInt) {
payload.u.as_uint = u;
}
// See Note [Meaning of HAS_u]
// IValue type model closely follows that of c10::Scalar
// Where all integers are upcast to 64-bit representation, and `as_int` is used as default
// representation unless value could not be represented as signed int
bool isUnsigned() const {
return Tag::UInt == tag || (Tag::Int == tag && payload.u.as_int >= 0);
}
uint64_t toUInt() const {
if (isUnsigned()) {
return payload.u.as_uint;
} else {
TORCH_INTERNAL_ASSERT(0, "expected unsigned int");
}
}
// Bool
IValue(bool b) : tag(Tag::Bool) {
#if defined(__clang__) && defined(__x86_64__)
@ -918,14 +893,8 @@ struct TORCH_API IValue final {
} else {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
s.isIntegral(false), "Unknown type in Scalar");
if (s.isUnsigned()) {
const auto val = s.toUInt64();
payload.u.as_uint = val;
tag = val <= std::numeric_limits<int64_t>::max() ? Tag::Int : Tag::UInt;
} else {
payload.u.as_int = s.toLong();
tag = Tag::Int;
}
tag = Tag::Int;
payload.u.as_int = s.toLong();
}
}
@ -949,8 +918,6 @@ struct TORCH_API IValue final {
return toSymFloat();
else if (isSymBool())
return toSymBool();
else if (isUnsigned())
return toUInt();
TORCH_CHECK(false, "IValue is not a Scalar");
}
@ -1280,8 +1247,6 @@ struct TORCH_API IValue final {
return true;
case Tag::Int:
return false;
case Tag::UInt:
return false;
case Tag::SymInt:
return true;
case Tag::SymFloat:
@ -1378,8 +1343,6 @@ struct TORCH_API IValue final {
union TriviallyCopyablePayload {
TriviallyCopyablePayload() : as_int(0) {}
int64_t as_int;
// See Note [Meaning of HAS_u]
uint64_t as_uint;
double as_double;
bool as_bool;
// Invariant: never nullptr; null state is represented as

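With the UInt tag removed above, integer payloads round-trip through Tag::Int only. A hedged sketch of the remaining behaviour, using only constructors and accessors that appear in this diff:

#include <ATen/core/ivalue.h>

void ivalue_int_roundtrip() {
  c10::IValue iv(static_cast<int64_t>(7)); // stored with Tag::Int
  TORCH_INTERNAL_ASSERT(iv.isInt());
  int64_t back = iv.toInt(); // 7
  (void)back;
}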
View File

@ -34,9 +34,9 @@ inline scalar_t vec_reduce_all(
scalar_t acc_arr[Vec::size()];
acc_vec.store(acc_arr);
for (const auto i : c10::irange(1, size)) {
std::array<scalar_t, Vec::size()> acc_arr_next = {0};
scalar_t acc_arr_next[Vec::size()] = {0};
acc_arr_next[0] = acc_arr[i];
Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
Vec acc_vec_next = Vec::loadu(acc_arr_next);
acc_vec = vec_fun(acc_vec, acc_vec_next);
}
acc_vec.store(acc_arr);
@ -102,8 +102,7 @@ struct VecReduceAllSIMD<float, Op> {
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
// !defined(C10_MOBILE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
@ -143,8 +142,7 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && !defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
defined(CPU_CAPABILITY_SVE256)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE))
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(

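As a usage reminder for the lane-by-lane fallback shown above, a hedged sketch that reduces one Vectorized<float> to a scalar sum (assuming the usual ATen/cpu/vec/functional.h entry point):

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>

float sum_lanes(const at::vec::Vectorized<float>& v) {
  // vec_reduce_all combines lanes with the supplied functor; the loop in the
  // diff above is its generic fallback when no SIMD specialization applies.
  return at::vec::vec_reduce_all<float>(
      [](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) {
        return x + y;
      },
      v,
      at::vec::Vectorized<float>::size());
}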
View File

@ -4,7 +4,7 @@
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
// Define the data type of VLS(vector-length specific).
typedef svbool_t vls_pred_t
@ -77,4 +77,4 @@ typedef svfloat64_t vls_float64_t
#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK)
#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK)
#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

View File

@ -19,7 +19,7 @@ namespace vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
template <>
struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
@ -230,8 +230,6 @@ __attribute__((optimize("no-tree-vectorize")))
#endif
inline std::tuple<Vectorized<float>, Vectorized<float>>
convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
auto bf16_vec1 = svzip1_bf16(zero, a);
auto bf16_vec2 = svzip2_bf16(zero, a);
@ -243,19 +241,18 @@ convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
inline Vectorized<c10::BFloat16> convert_float_bfloat16(
const Vectorized<float>& a,
const Vectorized<float>& b) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, a);
svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b);
return Vectorized<c10::BFloat16>(svuzp1_bf16(x1, x2));
}
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
__at_align__ float * values = new float[Vectorized<float>::size()];
for (const auto k : c10::irange(Vectorized<float>::size())) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
delete[] values;
}
inline void load_fp32_from_bf16(
@ -308,8 +305,8 @@ Vectorized<c10::BFloat16> inline operator/(
}
inline Vectorized<BFloat16>::Vectorized() {
const short zero = 0;
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
auto vals_f = svdup_n_f32(0);
values = convert_float_bfloat16(vals_f, vals_f);
}
inline Vectorized<BFloat16>::Vectorized(int val) {

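The zip-with-zero conversion above relies on bfloat16 being the upper half of an IEEE float32. A standalone scalar model of that assumption (not SVE code):

#include <cstdint>
#include <cstring>

// Placing a bfloat16 payload in the high 16 bits of a float32 word and
// zeroing the low half yields the float it encodes; svzip1/svzip2 with a
// zero vector perform the same widening lane-wise.
float bf16_bits_to_float(uint16_t bits) {
  uint32_t word = static_cast<uint32_t>(bits) << 16;
  float out;
  std::memcpy(&out, &word, sizeof(out));
  return out;
}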
View File

@ -8,7 +8,7 @@
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
#include <ATen/cpu/vec/sve/vec_double.h>
#include <ATen/cpu/vec/sve/vec_float.h>
@ -27,7 +27,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \
@ -231,6 +231,5 @@ std::pair<
#endif // __ARM_FEATURE_BF16
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY
} // namespace at::vec
}

View File

@ -22,7 +22,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
template <>
struct is_vec_specialized_for<double> : std::bool_constant<true> {};
@ -55,10 +55,11 @@ class Vectorized<double> {
operator svfloat64_t() const {
return values;
}
template <uint64_t mask>
static Vectorized<double> blend(
const Vectorized<double>& a,
const Vectorized<double>& b) {
const Vectorized<double>& b,
int64_t mask
) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int64_t flag_arr[size()];

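The blend change above (and the matching ones later in this diff) turns the lane-selection mask into a runtime argument instead of a template parameter. A hedged usage sketch, with the literal mask purely illustrative:

#include <ATen/cpu/vec/vec.h>

at::vec::Vectorized<double> blend_demo() {
  at::vec::Vectorized<double> a(1.0);
  at::vec::Vectorized<double> b(2.0);
  // Bit i of the mask selects lane i from b; a cleared bit keeps lane i of a.
  return at::vec::Vectorized<double>::blend(a, b, 0b0101);
}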
View File

@ -2,8 +2,10 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>
#include <algorithm>
#include <cmath>
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#include <sleef.h>
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
@ -22,7 +24,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
template <>
struct is_vec_specialized_for<float> : std::bool_constant<true> {};
@ -30,52 +32,77 @@ struct is_vec_specialized_for<float> : std::bool_constant<true> {};
template <>
class Vectorized<float> {
private:
vls_float32_t values;
__at_align__ float values[2048 / sizeof(float)];
public:
using value_type = float;
using size_type = int;
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(float);
static inline size_type size() {
return svcntw();
}
Vectorized() {
values = svdup_n_f32(0);
inline Vectorized() {svst1_f32(ptrue, values, svdup_n_f32(0));}
inline Vectorized(const float val) {
svst1_f32(ptrue, values, svdup_n_f32(val));
}
Vectorized(svfloat32_t v) : values(v) {}
Vectorized(float val) {
values = svdup_n_f32(val);
inline Vectorized(const svfloat32_t val) {
svst1_f32(ptrue, values, val);
}
template <
typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
Vectorized(Args... vals) {
__at_align__ float buffer[size()] = {vals...};
values = svld1_f32(ptrue, buffer);
template<typename T,
typename = std::enable_if_t<std::is_pointer_v<T>>>
inline Vectorized(float * val) {
svst1_f32(ptrue, values, svld1_f32(ptrue, val));
}
operator svfloat32_t() const {
return values;
template<typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
inline Vectorized(Args... vals) {
values = { vals... };
}
template <uint64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int32_t flag_arr[size()];
inline operator svfloat32_t() const {
return svld1_f32(ptrue, values);
}
static inline Vectorized<float> from_ptr(const float * vs) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(ptrue, static_cast<const float *>(vs)));
return v;
}
static inline Vectorized<float> from_ptr(const float * vs, int count) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(svwhilelt_b32_s32(0, count), static_cast<const float *>(vs)));
return v;
}
inline void set_lane(int i, float value) {
values[i] = value;
}
inline Vectorized<float> map(float (*fn)(float)) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i]));
}
return result;
}
inline Vectorized<float> map2(float (*fn)(float, float), const Vectorized<float> &b) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i], b.values[i]));
}
return result;
}
static inline Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b, const uint64_t mask) {
// Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise.
__at_align__ int32_t * flag_arr = new int32_t[size()];
for (int i = 0; i < size(); i++) {
flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
}
// Load the flag array into an SVE int32 vector.
svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr);
// Compare each lane of int_mask to 0; returns an svbool_t predicate where
// true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else
// from a.
svfloat32_t result = svsel_f32(blend_mask, b.values, a.values);
return Vectorized<float>(result);
svint32_t int_mask = svld1_s32(ptrue, flag_arr);
delete[] flag_arr;
// Compare each lane of int_mask to 0; returns an svbool_t predicate where true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(ptrue, int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else from a.
return svsel_f32(blend_mask, b, a);
}
static Vectorized<float> blendv(
static inline Vectorized<float> blendv(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask_) {
@ -84,16 +111,18 @@ class Vectorized<float> {
return svsel_f32(mask, b, a);
}
template <typename step_t>
static Vectorized<float> arange(
static inline Vectorized<float> arange(
float base = 0.f,
step_t step = static_cast<step_t>(1)) {
__at_align__ float buffer[size()];
__at_align__ float * buffer = new float[size()];
for (int64_t i = 0; i < size(); i++) {
buffer[i] = base + i * step;
}
return svld1_f32(ptrue, buffer);
auto tmp = Vectorized<float>::from_ptr(buffer);
delete[] buffer;
return tmp;
}
static Vectorized<float> set(
static inline Vectorized<float> set(
const Vectorized<float>& a,
const Vectorized<float>& b,
int64_t count = size()) {
@ -169,271 +198,219 @@ class Vectorized<float> {
poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);
return poly;
}
static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
svbool_t pg = svwhilelt_b32(0ull, count);
return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
static inline Vectorized<float> loadu(const void* ptr) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr));
}
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {
svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
} else {
svbool_t pg = svwhilelt_b32(0ull, count);
svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
}
static inline Vectorized<float> loadu(const void* ptr, int64_t count) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr), count);
}
const float& operator[](int idx) const = delete;
float& operator[](int idx) = delete;
int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit
// and others are translated to 0-bit
inline void store(void* ptr) const {
svst1_f32(ptrue, static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline void store(void* ptr, int count) const {
svst1_f32(svwhilelt_b32_s32(0, count), static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline const float& operator[](int idx) const {
return values[idx];
};
inline float& operator[](int idx) {
return values[idx];
};
inline int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
int64_t mask = 0;
__at_align__ int32_t mask_array[size()];
__at_align__ int32_t * mask_array = new int32_t[size()];
svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
svst1_s32(
ptrue,
mask_array,
svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK));
for (int64_t i = 0; i < size(); ++i) {
if (mask_array[i])
mask |= (1ull << i);
svbool_t svbool_mask = svcmpeq_f32(ptrue, *this, ZERO_F32);
svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask,
ALL_S32_TRUE_MASK,
ALL_S32_FALSE_MASK));
for (int64_t j = 0; j < size(); ++j) {
if (mask_array[j]) mask |= (1ull << j);
}
delete[] mask_array;
return mask;
}
Vectorized<float> isnan() const {
inline Vectorized<float> isnan() const {
// NaN check
svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
auto mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
bool has_inf_nan() const {
return svptest_any(
ptrue,
svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
inline bool has_inf_nan() const {
return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, *this, *this), ZERO_F32));
}
Vectorized<float> map(float (*f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
inline Vectorized<float> abs() const {
return svabs_f32_x(ptrue, *this);
}
Vectorized<float> abs() const {
return svabs_f32_x(ptrue, values);
}
Vectorized<float> angle() const {
inline Vectorized<float> angle() const {
const auto nan_vec = svdup_n_f32(NAN);
const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
const auto nan_mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
const auto pi = svdup_n_f32(c10::pi<float>);
const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
const auto neg_mask = svcmplt_f32(ptrue, *this, ZERO_F32);
auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
angle = svsel_f32(nan_mask, nan_vec, angle);
return angle;
return svsel_f32(nan_mask, nan_vec, angle);
}
Vectorized<float> real() const {
return values;
inline Vectorized<float> real() const {
return *this;
}
Vectorized<float> imag() const {
inline Vectorized<float> imag() const {
return Vectorized<float>(0.f);
}
Vectorized<float> conj() const {
return values;
inline Vectorized<float> conj() const {
return *this;
}
Vectorized<float> acos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
inline Vectorized<float> acos() const {
return USE_SLEEF(Sleef_acosfx_u10sve(*this), map(std::acos));
}
Vectorized<float> acosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
inline Vectorized<float> acosh() const {
return USE_SLEEF(Sleef_acoshfx_u10sve(*this), map(std::acosh));
}
Vectorized<float> asin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
inline Vectorized<float> asin() const {
return USE_SLEEF(Sleef_asinfx_u10sve(*this), map(std::asin));
}
Vectorized<float> asinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinhfx_u10sve(values)), map(std::asinh));
inline Vectorized<float> asinh() const {
return USE_SLEEF(Sleef_asinhfx_u10sve(*this), map(std::asinh));
}
Vectorized<float> atan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
inline Vectorized<float> atan() const {
return USE_SLEEF(Sleef_atanfx_u10sve(*this), map(std::atan));
}
Vectorized<float> atanh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
inline Vectorized<float> atanh() const {
return USE_SLEEF(Sleef_atanhfx_u10sve(*this), map(std::atanh));
}
Vectorized<float> atan2(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::atan2(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> copysign(const Vectorized<float>& sign) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_sign[size()];
store(tmp);
sign.store(tmp_sign);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
}
return loadu(tmp);
})} Vectorized<float> erf() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
inline Vectorized<float> atan2(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_atan2fx_u10sve(*this, b), map2(std::atan2, b));
}
Vectorized<float> erfc() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
inline Vectorized<float> copysign(const Vectorized<float> &sign) const {
return USE_SLEEF(Sleef_copysignfx_sve(*this, sign), map2(std::copysign, sign));
}
Vectorized<float> erfinv() const {
inline Vectorized<float> erf() const {
return USE_SLEEF(Sleef_erffx_u10sve(*this), map(std::erf));
}
inline Vectorized<float> erfc() const {
return USE_SLEEF(Sleef_erfcfx_u15sve(*this), map(std::erfc));
}
inline Vectorized<float> erfinv() const {
return map(calc_erfinv);
}
Vectorized<float> exp() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
inline Vectorized<float> exp() const {
return USE_SLEEF(Sleef_expfx_u10sve(*this), map(std::exp));
}
Vectorized<float> exp2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
inline Vectorized<float> exp2() const {
return USE_SLEEF(Sleef_exp2fx_u10sve(*this), map(std::exp2));
}
Vectorized<float> expm1() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
inline Vectorized<float> expm1() const {
return USE_SLEEF(Sleef_expm1fx_u10sve(*this), map(std::expm1));
}
Vectorized<float> exp_u20() const {
return exp();
// Implementation copied from Arm Optimized Routines:
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c
inline Vectorized<float> exp_u20() {
// Load values into an SVE vector
svfloat32_t val_vec = svld1(svptrue_b32(), values); // 'values' is float*
// Check for special case: |x| >= 87.3...
svbool_t is_special_case = svacgt(svptrue_b32(), val_vec, 0x1.5d5e2ap+6f);
if (svptest_any(svptrue_b32(), is_special_case)) {
return exp(); // fall back to the accurate exp() path for out-of-range inputs
}
Vectorized<float> fexp_u20() const {
return exp();
// Constants
const svfloat32_t ln2_hi = svdup_f32(0x1.62e4p-1f);
const svfloat32_t ln2_lo = svdup_f32(0x1.7f7d1cp-20f);
const svfloat32_t c1 = svdup_f32(0.5f);
const svfloat32_t inv_ln2 = svdup_f32(0x1.715476p+0f);
const svfloat32_t shift_vec = svdup_f32(0x1.803f8p17f); // scalar to vector
// n = round(x / ln2)
svfloat32_t z = svmad_x(svptrue_b32(), inv_ln2, val_vec, shift_vec);
svfloat32_t n = svsub_x(svptrue_b32(), z, shift_vec);
// r = x - n * ln2
svfloat32_t r = svsub_x(svptrue_b32(), val_vec, svmul_x(svptrue_b32(), n, ln2_hi));
r = svsub_x(svptrue_b32(), r, svmul_x(svptrue_b32(), n, ln2_lo));
// scale = 2^(n)
svfloat32_t scale = svexpa(svreinterpret_u32(z));
// poly(r) = exp(r) - 1 ≈ r + 0.5 * r^2
svfloat32_t r2 = svmul_x(svptrue_b32(), r, r);
svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1);
// return scale * (1 + poly)
return svmla_x(svptrue_b32(), scale, scale, poly);
}
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_q[size()];
store(tmp);
q.store(tmp_q);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
}
return loadu(tmp);
})} Vectorized<float> hypot(const Vectorized<float>& b) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> i0() const {
inline Vectorized<float> fexp_u20() {
return exp_u20();
}
inline Vectorized<float> fmod(const Vectorized<float>& q) const {
return USE_SLEEF(Sleef_fmodfx_sve(*this, q), map2(std::fmod, q));
}
inline Vectorized<float> hypot(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_hypotfx_u05sve(*this, b), map2(std::hypot, b));
}
inline Vectorized<float> i0() const {
return map(calc_i0);
}
Vectorized<float> i0e() const {
return map(calc_i0e);
inline Vectorized<float> i0e() const {
return map(calc_i0e<float>);
}
Vectorized<float> digamma() const {
inline Vectorized<float> digamma() const {
return map(calc_digamma);
}
Vectorized<float> igamma(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
inline Vectorized<float> igamma(const Vectorized<float> &x) const {
return map2(calc_igamma<float>, x);
}
Vectorized<float> igammac(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);
inline Vectorized<float> igammac(const Vectorized<float> &x) const {
return map2(calc_igammac<float>, x);
}
Vectorized<float> nextafter(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> log() const {
return USE_SLEEF(
Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
inline Vectorized<float> nextafter(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_nextafterfx_sve(*this, b), map2(std::nextafter, b));
}
Vectorized<float> log2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
inline Vectorized<float> log() const {
return USE_SLEEF(Sleef_logfx_u10sve(*this), map(std::log));
}
Vectorized<float> log10() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
inline Vectorized<float> log2() const {
return USE_SLEEF(Sleef_log2fx_u10sve(*this), map(std::log2));
}
Vectorized<float> log1p() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
inline Vectorized<float> log10() const {
return USE_SLEEF(Sleef_log10fx_u10sve(*this), map(std::log10));
}
Vectorized<float> frac() const;
Vectorized<float> sin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
inline Vectorized<float> log1p() const {
return USE_SLEEF(Sleef_log1pfx_u10sve(*this), map(std::log1p));
}
Vectorized<float> sinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
inline Vectorized<float> frac() const;
inline Vectorized<float> sin() const {
return USE_SLEEF(Sleef_sinfx_u10sve(*this), map(std::sin));
}
Vectorized<float> cos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
inline Vectorized<float> sinh() const {
return USE_SLEEF(Sleef_sinhfx_u10sve(*this), map(std::sinh));
}
Vectorized<float> cosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
inline Vectorized<float> cos() const {
return USE_SLEEF(Sleef_cosfx_u10sve(*this), map(std::cos));
}
Vectorized<float> ceil() const {
return svrintp_f32_x(ptrue, values);
inline Vectorized<float> cosh() const {
return USE_SLEEF(Sleef_coshfx_u10sve(*this), map(std::cosh));
}
Vectorized<float> floor() const {
return svrintm_f32_x(ptrue, values);
inline Vectorized<float> ceil() const {
return svrintp_f32_x(ptrue, *this);
}
Vectorized<float> neg() const {
return svneg_f32_x(ptrue, values);
inline Vectorized<float> floor() const {
return svrintm_f32_x(ptrue, *this);
}
Vectorized<float> round() const {
return svrinti_f32_x(ptrue, values);
inline Vectorized<float> neg() const {
return svneg_f32_x(ptrue, *this);
}
Vectorized<float> tan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
inline Vectorized<float> round() const {
return svrinti_f32_x(ptrue, *this);
}
inline Vectorized<float> tan() const {
return USE_SLEEF(Sleef_tanfx_u10sve(*this), map(std::tan));
}
// Implementation is picked from
// https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
Vectorized<float> tanh() const {
inline Vectorized<float> tanh() const {
// Constants used for the tanh calculation.
const svfloat32_t CONST_1 =
svdup_n_f32(1.f); // Constant 1.0f for the tanh formula.
@ -450,7 +427,7 @@ class Vectorized<float> {
// instability. svmax_f32_z ensures values are greater than -10, and
// svmin_f32_z ensures they are less than 10.
svfloat32_t x = svmin_f32_z(
ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH);
ptrue, svmax_f32_z(ptrue, *this, CONST_MIN_TANH), CONST_MAX_TANH);
// Step 2: Calculate exp(2 * x), where x is the clamped value.
// svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of
@ -472,104 +449,85 @@ class Vectorized<float> {
// Return the calculated tanh values.
return tanh;
}
Vectorized<float> trunc() const {
return svrintz_f32_x(ptrue, values);
inline Vectorized<float> trunc() const {
return svrintz_f32_x(ptrue, *this);
}
Vectorized<float> lgamma() const {
return USE_SLEEF(
Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
inline Vectorized<float> lgamma() const {
return USE_SLEEF(Sleef_lgammafx_u10sve(*this), map(std::lgamma));
}
Vectorized<float> sqrt() const {
return svsqrt_f32_x(ptrue, values);
inline Vectorized<float> sqrt() const {
return svsqrt_f32_x(ptrue, *this);
}
Vectorized<float> reciprocal() const {
return svdivr_f32_x(ptrue, values, ONE_F32);
inline Vectorized<float> reciprocal() const {
return svdivr_f32_x(ptrue, *this, svdup_n_f32(1.f));
}
Vectorized<float> rsqrt() const {
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
inline Vectorized<float> rsqrt() const {
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, *this), ONE_F32);
}
Vectorized<float> pow(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_powfx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::pow(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} // Comparison using the _CMP_**_OQ predicate.
// `O`: get false if an operand is NaN
// `Q`: do not raise if an operand is NaN
Vectorized<float> operator==(const Vectorized<float>& other) const {
svbool_t mask = svcmpeq_f32(ptrue, values, other);
inline Vectorized<float> pow(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_powfx_u10sve(*this, b), map2(std::pow, b));
}
// Comparison using the _CMP_**_OQ predicate.
// `O`: get false if an operand is NaN
// `Q`: do not raise if an operand is NaN
inline Vectorized<float> operator==(const Vectorized<float>& other) const {
svbool_t mask = svcmpeq_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
inline Vectorized<float> operator!=(const Vectorized<float>& other) const {
svbool_t mask = svcmpne_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
inline Vectorized<float> operator<(const Vectorized<float>& other) const {
svbool_t mask = svcmplt_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator!=(const Vectorized<float>& other) const {
svbool_t mask = svcmpne_f32(ptrue, values, other);
inline Vectorized<float> operator<=(const Vectorized<float>& other) const {
svbool_t mask = svcmple_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator<(const Vectorized<float>& other) const {
svbool_t mask = svcmplt_f32(ptrue, values, other);
inline Vectorized<float> operator>(const Vectorized<float>& other) const {
svbool_t mask = svcmpgt_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator<=(const Vectorized<float>& other) const {
svbool_t mask = svcmple_f32(ptrue, values, other);
inline Vectorized<float> operator>=(const Vectorized<float>& other) const {
svbool_t mask = svcmpge_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator>(const Vectorized<float>& other) const {
svbool_t mask = svcmpgt_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator>=(const Vectorized<float>& other) const {
svbool_t mask = svcmpge_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> eq(const Vectorized<float>& other) const;
Vectorized<float> ne(const Vectorized<float>& other) const;
Vectorized<float> gt(const Vectorized<float>& other) const;
Vectorized<float> ge(const Vectorized<float>& other) const;
Vectorized<float> lt(const Vectorized<float>& other) const;
Vectorized<float> le(const Vectorized<float>& other) const;
inline Vectorized<float> eq(const Vectorized<float>& other) const;
inline Vectorized<float> ne(const Vectorized<float>& other) const;
inline Vectorized<float> gt(const Vectorized<float>& other) const;
inline Vectorized<float> ge(const Vectorized<float>& other) const;
inline Vectorized<float> lt(const Vectorized<float>& other) const;
inline Vectorized<float> le(const Vectorized<float>& other) const;
};
template <>
Vectorized<float> inline operator+(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
return svadd_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline operator-(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
return svsub_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline operator*(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
return svmul_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline operator/(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
return svdiv_f32_x(ptrue, a, b);
}
// frac. Implement this here so we can use subtraction
Vectorized<float> inline Vectorized<float>::frac() const {
inline Vectorized<float> Vectorized<float>::frac() const {
return *this - this->trunc();
}
@ -585,115 +543,91 @@ Vectorized<float> inline maximum(
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
return svmin_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline clamp(
const Vectorized<float>& a,
const Vectorized<float>& min,
const Vectorized<float>& max) {
inline Vectorized<float> clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a));
}
template <>
Vectorized<float> inline clamp_max(
const Vectorized<float>& a,
const Vectorized<float>& max) {
inline Vectorized<float> clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
return svmin_f32_x(ptrue, max, a);
}
template <>
Vectorized<float> inline clamp_min(
const Vectorized<float>& a,
const Vectorized<float>& min) {
inline Vectorized<float> clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
return svmax_f32_x(ptrue, min, a);
}
template <>
Vectorized<float> inline operator&(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}
template <>
Vectorized<float> inline operator|(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}
template <>
Vectorized<float> inline operator^(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}
Vectorized<float> inline Vectorized<float>::eq(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
return (*this == other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::ne(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
return (*this != other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::gt(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
return (*this > other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::ge(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
return (*this >= other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::lt(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
return (*this < other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::le(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
return (*this <= other) & Vectorized<float>(1.0f);
}
template <>
inline void convert(const float* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
const int64_t fraction = n % svcntw();
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svst1_f32(ptrue, dst + i, svldnt1_f32(ptrue, src + i));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
svbool_t pg = svwhilelt_b32(i, n);
svst1_f32(pg, dst + i, svldnt1_f32(pg, src + i));
}
}
template <>
inline void convert(const float* src, at::Half* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const float *src, at::Half *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svfloat16_t src_vec = svuzp1_f16(
svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)),
ZERO_F16);
svst1_f16(pg_16, reinterpret_cast<float16_t*>(dst) + i, src_vec);
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_16 = svwhilelt_b16(i, n);
pg_32 = svwhilelt_b32(i, n);
svfloat16_t src_vec = svuzp1_f16(
@ -703,19 +637,18 @@ inline void convert(const float* src, at::Half* dst, int64_t n) {
}
template <>
inline void convert(const at::Half* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const at::Half *src, float *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svfloat16_t src_vec = svzip1_f16(
svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
ZERO_F16);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
ZERO_F16);
svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_16 = svwhilelt_b16(i, n);
pg_32 = svwhilelt_b32(i, n);
svfloat16_t src_vec = svzip1_f16(
@ -726,20 +659,19 @@ inline void convert(const at::Half* src, float* dst, int64_t n) {
}
template <>
inline void convert(const bool* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const bool *src, float *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_8 = svwhilelt_b8(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svuint8_t src_vec_u8 =
svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_8 = svwhilelt_b8(i, n);
pg_32 = svwhilelt_b32(i, n);
svuint8_t src_vec_u8 =
@ -751,10 +683,7 @@ inline void convert(const bool* src, float* dst, int64_t n) {
}
template <>
Vectorized<float> inline fmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
inline Vectorized<float> fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
return svmad_f32_x(ptrue, a, b, c);
}
@ -785,4 +714,4 @@ Vectorized<float> inline fnmsub(
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY
} // namespace at::vec
} // namespace at::vec
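The exp_u20 routine added in this file follows the Arm Optimized Routines range reduction. A scalar model of the same math, offered only as a sketch (constants are copied from the vector code; std::ldexp stands in for svexpa):

#include <cmath>

float exp_u20_model(float x) {
  // x = n*ln2 + r with r in [-ln2/2, ln2/2]; exp(x) = 2^n * (1 + poly(r)),
  // where poly(r) ~= r + 0.5*r^2, matching the SVE kernel above.
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f;
  const float ln2_lo = 0x1.7f7d1cp-20f;
  float n = std::nearbyint(x * inv_ln2);
  float r = (x - n * ln2_hi) - n * ln2_lo;
  float scale = std::ldexp(1.0f, static_cast<int>(n)); // 2^n
  float poly = r + 0.5f * r * r;
  return scale * (1.0f + poly);
}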

View File

@ -15,7 +15,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
#define VEC_INT_SVE_TEMPLATE(vl, bit) \
template <> \
@ -49,10 +49,11 @@ inline namespace CPU_CAPABILITY {
operator svint##bit##_t() const { \
return values; \
} \
template <uint64_t mask> \
static Vectorized<int##bit##_t> blend( \
const Vectorized<int##bit##_t>& a, \
const Vectorized<int##bit##_t>& b) { \
const Vectorized<int##bit##_t>& b, \
uint64_t mask \
) { \
__at_align__ int##bit##_t flag_arr[size()]; \
for (int i = 0; i < size(); ++i) { \
flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 1 : 0; \
@ -493,7 +494,7 @@ Vectorized<int8_t> inline operator>>(
return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b));
}
#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256)
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -46,7 +46,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
// NOTE: These are low-performance implementations that we fall back on
// if we are not building with SVE. This may not be an issue, because
@ -100,12 +100,12 @@ struct VectorizedQuantizedConverter {
Vectorized<float> zero_point,
Vectorized<float> scale_zp_premul) const {
float_vec_return_type rv;
float tmp_scale[Vectorized<float>::size()];
float tmp_zero_point[Vectorized<float>::size()];
float * tmp_scale = new float[Vectorized<float>::size()];
float * tmp_zero_point = new float[Vectorized<float>::size()];
scale.store(tmp_scale);
zero_point.store(tmp_zero_point);
for (int i = 0; i < float_num_vecs(); ++i) {
float tmp_vals[Vectorized<float>::size()];
float * tmp_vals = new float[Vectorized<float>::size()];
for (int j = 0; j < Vectorized<float>::size(); ++j) {
tmp_vals[j] = at::native::dequantize_val<T>(
tmp_scale[j],
@ -113,6 +113,10 @@ struct VectorizedQuantizedConverter {
T(vals[Vectorized<float>::size() * i + j]));
}
rv[i] = Vectorized<float>::loadu(tmp_vals);
delete[] tmp_scale;
delete[] tmp_zero_point;
delete[] tmp_vals;
}
return rv;
}
@ -121,12 +125,12 @@ struct VectorizedQuantizedConverter {
Vectorized<float> scale,
Vectorized<float> zero_point) const {
float_vec_return_type rv;
float tmp_scale[Vectorized<float>::size()];
float tmp_zero_point[Vectorized<float>::size()];
float * tmp_scale = new float[Vectorized<float>::size()];
float * tmp_zero_point = new float[Vectorized<float>::size()];
scale.store(tmp_scale);
zero_point.store(tmp_zero_point);
for (int i = 0; i < float_num_vecs(); ++i) {
float tmp_vals[Vectorized<float>::size()];
float * tmp_vals = new float[Vectorized<float>::size()];
for (int j = 0; j < Vectorized<float>::size(); ++j) {
tmp_vals[j] = at::native::dequantize_val<T>(
tmp_scale[j],
@ -134,6 +138,9 @@ struct VectorizedQuantizedConverter {
T(vals[Vectorized<float>::size() * i + j]));
}
rv[i] = Vectorized<float>::loadu(tmp_vals);
delete[] tmp_scale;
delete[] tmp_zero_point;
delete[] tmp_vals;
}
return rv;
}
@ -205,7 +212,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@ -216,10 +223,11 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::qint32, /*precision=*/32>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::qint32*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());
delete[] float_vals;
return Vectorized<c10::qint32>::loadu(qvals.data());
}
@ -359,7 +367,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@ -370,10 +378,11 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::qint8>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::qint8*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());
delete[] float_vals;
return Vectorized<c10::qint8>::loadu(qvals.data());
}
@ -511,7 +520,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@ -522,10 +531,11 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::quint8>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::quint8*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());
delete[] float_vals;
return Vectorized<c10::quint8>::loadu(qvals.data());
}
@ -600,7 +610,7 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}
#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256)
} // namespace CPU_CAPABILITY
} // namespace at::vec
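For the per-lane fallback above, each dequantize_val call reduces to an affine map from the quantized domain back to float. A scalar sketch of that relationship (illustrative, not the ATen implementation):

#include <cstdint>

// real_value ~= scale * (quantized_value - zero_point)
float dequant_lane(float scale, float zero_point, int32_t qval) {
  return scale * (static_cast<float>(qval) - zero_point);
}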

View File

@ -4,7 +4,9 @@
#include <ATen/cpu/vec/intrinsics.h>
#ifdef __aarch64__
#if !defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#else
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>

View File

@ -241,7 +241,7 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
Vectorized() = default;
Vectorized(c10::BFloat16 val)
: Vectorized16(at_vdupq_n_bf16(c10::bit_cast<at_bfloat16_t>(val.x))) {}
: Vectorized16(at_vdupq_n_bf16(val.x)) {}
Vectorized(float val) : Vectorized(c10::BFloat16(val)) {}
Vectorized(
value_type val0,
@ -253,14 +253,14 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
value_type val6,
value_type val7)
: Vectorized16(at_bfloat16x8_t{
c10::bit_cast<at_bfloat16_t>(val0.x),
c10::bit_cast<at_bfloat16_t>(val1.x),
c10::bit_cast<at_bfloat16_t>(val2.x),
c10::bit_cast<at_bfloat16_t>(val3.x),
c10::bit_cast<at_bfloat16_t>(val4.x),
c10::bit_cast<at_bfloat16_t>(val5.x),
c10::bit_cast<at_bfloat16_t>(val6.x),
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
val0.x,
val1.x,
val2.x,
val3.x,
val4.x,
val5.x,
val6.x,
val7.x}) {}
static Vectorized<c10::BFloat16> blendv(
const Vectorized<c10::BFloat16>& a,

View File

@ -4,7 +4,7 @@
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
template <typename src_t>
struct VecConvert<
float,

View File

@ -41,32 +41,16 @@ inline namespace CPU_CAPABILITY {
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif
template <int index, bool mask_val>
template <int index>
struct BlendRegs {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res);
};
template <int index>
struct BlendRegs<index, true> {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res) {
return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
}
};
template <int index>
struct BlendRegs<index, false> {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res) {
return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
}
float32x4_t& res,
bool mask_val
) {
return vsetq_lane_f32(vgetq_lane_f32(mask_val ? b : a, index), res, index);
}
};
template <>
@ -94,19 +78,15 @@ class Vectorized<float> {
operator float32x4_t() const {
return values;
}
template <int64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
const Vectorized<float>& b,
int64_t mask) {
Vectorized<float> vec;
vec.values = BlendRegs < 0,
(mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 1,
(mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 2,
(mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 3,
(mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs <0>::impl(a.values, b.values, vec.values, (mask & 0x01) != 0);
vec.values = BlendRegs <1> ::impl(a.values, b.values, vec.values, (mask & 0x02) != 0);
vec.values = BlendRegs <2> ::impl(a.values, b.values, vec.values, (mask & 0x04) != 0);
vec.values = BlendRegs <3> ::impl(a.values, b.values, vec.values, (mask & 0x08) != 0);
return vec;
}
static Vectorized<float> blendv(
@ -307,11 +287,50 @@ class Vectorized<float> {
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp)
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2)
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
// Implementation copied from Arm Optimized Routines: https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
Vectorized<float> exp_u20() const {
return exp();
// bail out to sleef if it's a special case:
// i.e. there's an input s.t. |input| > 87.3....
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
uint32x4_t cmp = vcagtq_f32 (values, special_bound);
if (vpaddd_u64 (vreinterpretq_u64_u32 (cmp)) != 0) {
return exp();
}
const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);
const float ln2_hi = 0x1.62e4p-1f;
const float ln2_lo = 0x1.7f7d1cp-20f;
const float c0 = 0x1.0e4020p-7f;
const float c2 = 0x1.555e66p-3f;
const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};
const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);
const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);
const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);
const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t n = vrndaq_f32 (vmulq_f32 (values, inv_ln2));
float32x4_t r = vfmsq_laneq_f32 (values, n, ln2_c02, 0);
r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, exponent_bias));
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (c1, r, ln2_c02, 2);
float32x4_t q = vfmaq_laneq_f32 (c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (c4, r);
float32x4_t poly = vfmaq_f32 (p, q, r2);
return vfmaq_f32 (scale, poly, scale);
}
Vectorized<float> fexp_u20() const {
return exp();
return exp_u20();
}
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
fmod,
@ -645,4 +664,4 @@ inline Vectorized<float> Vectorized<float>::erf() const {
#endif /* defined(aarch64) */
} // namespace CPU_CAPABILITY
} // namespace at::vec
} // namespace at::vec

View File

@ -813,11 +813,12 @@ static inline Vectorized<T> binary_op_as_fp32(
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name( \
const type* data, Vectorized<float>& out) { \
__at_align__ float values[Vectorized<float>::size()]; \
__at_align__ float * values = new float[Vectorized<float>::size()]; \
for (const auto k : c10::irange(Vectorized<float>::size())) { \
values[k] = data[k]; \
} \
out = Vectorized<float>::loadu(values); \
delete[] values; \
} \
\
inline void load_fp32_from_##name( \

View File

@ -269,12 +269,13 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
#else // defined(CPU_CAPABILITY_AVX2)
#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__))
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
#endif
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
#endif
#endif // defined(CPU_CAPABILITY_AVX2)
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -294,7 +294,7 @@ struct VecConvert<
};
#endif
#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
template <>
struct VecConvert<float, 1, BFloat16, 1> {

View File

@ -270,7 +270,7 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
!defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
CONVERT_NON_VECTORIZED_INIT(Half, half)
#endif

View File

@ -915,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}
#elif !defined(CPU_CAPABILITY_SVE256)
#elif !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
// NOTE: These are low-performance implementations that we fall back on
// if we are not building with AVX2. This may not be an issue, because
@ -1374,11 +1374,11 @@ Vectorized<c10::quint8> inline maximum(
#endif // if defined(CPU_CAPABILITY_AVX2)
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
at::vec::Vectorized<int8_t> src) {
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
std::pair<Vectorized<float>, Vectorized<float>>
inline convert_int8_to_float(at::vec::Vectorized<int8_t> src) {
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));

View File

@ -68,7 +68,7 @@ Windows llvm will not have this definition.
#define VECTOR_WIDTH 64
#define int_vector __m512i
#elif defined(__aarch64__) && \
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
!defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256) // CPU_CAPABILITY_AVX512
// SVE code expects 256-vectors; leave that set for SVE?
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
@ -79,6 +79,18 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 16
#else // CPU_CAPABILITY_AVX512
#if defined(CPU_CAPABILITY_SVE)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#define int_vector __m256i
#else // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
#if defined(CPU_CAPABILITY_SVE256)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(32)))
#elif defined(_WIN32)
@ -88,6 +100,18 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 32
#define int_vector __m256i
#else // CPU_CAPABILITY_SVE
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#define int_vector __m256i
#endif // CPU_CAPABILITY_SVE256
#endif // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
#endif // CPU_CAPABILITY_AVX512
namespace at::vec {
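
The net effect of the new branches is that a plain SVE build (CPU_CAPABILITY_SVE without SVE256) now gets 16-byte alignment and VECTOR_WIDTH 16, matching generic aarch64, while SVE256 keeps 32. Downstream code only ever sees the two macros; a rough sketch of the typical usage pattern, with GCC-style definitions hard-coded here purely for illustration:

#include <cstring>

// Illustrative only: in ATen these come from the #if ladder above.
#define __at_align__ __attribute__((aligned(16)))
#define VECTOR_WIDTH 16

void copy_one_register(const float* src, float* dst) {
  // A stack buffer sized and aligned for one SIMD register's worth of bytes.
  __at_align__ char buf[VECTOR_WIDTH];
  std::memcpy(buf, src, VECTOR_WIDTH);
  std::memcpy(dst, buf, VECTOR_WIDTH);
}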
@ -210,8 +234,7 @@ struct Vectorized {
auto as_bytes() const -> const char* {
return reinterpret_cast<const char*>(values);
}
template <int64_t mask_>
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b, const int64_t mask_) {
int64_t mask = mask_;
Vectorized vector;
for (const auto i : c10::irange(size())) {
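
With a vector length that is only known at runtime, a compile-time lane mask is less natural, and the fallback blend now takes the mask as an ordinary argument: callers write blend(a, b, mask) instead of blend<mask>(a, b). A sketch of the same per-lane selection on a plain std::array (names and signatures here are illustrative, not the ATen ones):

#include <array>
#include <cstdint>
#include <cstddef>

// Pick lane i from b when bit i of mask is set, otherwise from a.
template <typename T, std::size_t N>
std::array<T, N> blend(const std::array<T, N>& a,
                       const std::array<T, N>& b,
                       int64_t mask) {
  std::array<T, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = (mask & 0x01) ? b[i] : a[i];
    mask >>= 1;
  }
  return out;
}

// Usage: previously blend<0b0101>(a, b); now the mask is a regular argument:
// auto v = blend(a, b, 0b0101);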
@ -1312,7 +1335,7 @@ std::
T const* base_addr,
const Vectorized<int_same_size_t<T>>& vindex,
Vectorized<T>& mask) {
static constexpr int size = Vectorized<T>::size();
static const int size = Vectorized<T>::size();
T src_arr[size];
int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
int_same_size_t<T> index_arr[size];
@ -1405,7 +1428,7 @@ inline Vectorized<T> convert_to_fp_of_same_size(
// clang-format on
template <typename T>
inline std::enable_if_t<
Vectorized<T>::size() % 2 == 0,
true,
std::pair<Vectorized<T>, Vectorized<T>>>
deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
static constexpr int size = Vectorized<T>::size();
@ -1444,7 +1467,7 @@ VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2)
// clang-format on
template <typename T>
inline std::enable_if_t<
Vectorized<T>::size() % 2 == 0,
true,
std::pair<Vectorized<T>, Vectorized<T>>>
interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
static constexpr int size = Vectorized<T>::size();
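
Relaxing the enable_if condition from Vectorized<T>::size() % 2 == 0 to true follows from the same constraint: once size() stops being constexpr it cannot appear in a constant expression, so the evenness requirement can no longer be encoded in SFINAE and becomes an implicit assumption. A toy illustration of the failure mode (Vec and the trivial body are placeholders, not the real deinterleave2 semantics):

#include <type_traits>
#include <utility>

struct Vec {
  static int size();  // runtime value now, not constexpr
};

// Would no longer compile: Vec::size() is not a constant expression.
// template <typename T>
// std::enable_if_t<Vec::size() % 2 == 0, std::pair<T, T>> deinterleave2(T a, T b);

// Compiles: the constraint is dropped and the evenness requirement is assumed.
template <typename T>
std::enable_if_t<true, std::pair<T, T>> deinterleave2(T a, T b) {
  return {a, b};  // placeholder body, not the real interleaving logic
}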
@ -1486,7 +1509,7 @@ inline void convert(const src_T* src, dst_T* dst, int64_t n) {
template <typename T>
inline Vectorized<T> flip(const Vectorized<T>& data) {
static constexpr int size = Vectorized<T>::size();
static const int size = Vectorized<T>::size();
T output[size];
T buffer[size];
data.store(static_cast<void*>(buffer));

View File

@ -15,7 +15,7 @@ template <
struct VecConvert {
static inline VectorizedN<dst_t, dst_n> apply(
const VectorizedN<src_t, src_n>& src) {
constexpr int count = std::min(
const int count = std::min(
VectorizedN<src_t, src_n>::size(), VectorizedN<dst_t, dst_n>::size());
__at_align__ src_t src_buf[VectorizedN<src_t, src_n>::size()];
src.store(src_buf);

View File

@ -2,6 +2,8 @@
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_n.h>
#include <cassert>
namespace at::vec {
inline namespace CPU_CAPABILITY {
@ -38,9 +40,9 @@ struct VecMaskLoad {
static inline VectorizedN<data_t, data_n> apply(
const data_t* ptr,
const VecMask<mask_t, mask_n>& vec_mask) {
constexpr typename VecMask<mask_t, mask_n>::size_type size =
const typename VecMask<mask_t, mask_n>::size_type size =
VecMask<mask_t, mask_n>::size();
static_assert(VectorizedN<data_t, data_n>::size() >= size);
assert((VectorizedN<data_t, data_n>::size() >= size));
__at_align__ data_t data[size];
__at_align__ mask_t mask[size];
auto mask_ = VectorizedN<mask_t, mask_n>(vec_mask);
@ -134,7 +136,7 @@ class VecMask {
template <typename U, int L>
static VecMask<T, N> from(const VectorizedN<U, L>& b_vec) {
__at_align__ U b_buf[size()];
if constexpr (size() >= VectorizedN<U, L>::size()) {
if (size() >= VectorizedN<U, L>::size()) {
b_vec.store(b_buf);
for (int i = VectorizedN<U, L>::size(); i < size(); i++) {
b_buf[i] = static_cast<U>(0);
@ -235,16 +237,18 @@ class VecMask {
template <
typename U,
int L,
std::enable_if_t<L >= 2 && VectorizedN<U, L>::size() >= size(), int> = 0>
std::enable_if_t<L >= 2, int> = 0>
VectorizedN<U, L> loadu(const U* ptr) const {
assert((VectorizedN<U, L>::size() >= size()));
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}
template <
typename U,
int L,
std::enable_if_t<L == 1 && Vectorized<U>::size() >= size(), int> = 0>
std::enable_if_t<L == 1, int> = 0>
Vectorized<U> loadu(const U* ptr) const {
assert((Vectorized<U>::size() >= size()));
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}
};
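
The same migration from compile-time to runtime checks explains the new <cassert> include: static_assert and the size()-based enable_if guards become plain assert calls. One trade-off worth keeping in mind is that assert is compiled out under NDEBUG, so release builds no longer enforce these size relationships. A minimal sketch of the pattern (Lanes is an illustrative stand-in):

#include <cassert>

struct Lanes {
  static int size() { return 8; }  // runtime value in the vec-length-agnostic build
};

inline void load_masked(const float* /*ptr*/) {
  // Before: static_assert(Lanes::size() >= 4);  // requires a constant expression
  assert(Lanes::size() >= 4);                    // After: runtime check, disabled by NDEBUG
}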

View File

@ -28,7 +28,7 @@ class VectorizedN {
using size_type = int;
static constexpr size_type size_T = sizeof(T);
static constexpr size_type size() {
static size_type size() {
return Vectorized<T>::size() * N;
}
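
Dropping constexpr from VectorizedN::size() (and from Vectorized::size() elsewhere in this diff) is what allows an SVE build to report the hardware vector length at runtime. One plausible way a length-agnostic float lane count could be obtained on SVE, using the ACLE svcntw() intrinsic, shown here as a sketch of the idea rather than the code in this PR:

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

// Number of 32-bit lanes in one SVE vector register, known only at runtime:
// 4 on 128-bit implementations, 8 on 256-bit, 16 on 512-bit, and so on.
inline int sve_float_lanes() {
  return static_cast<int>(svcntw());
}
#endif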

View File

@ -832,7 +832,7 @@ void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
}
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
at::native::bgemm_internal_ck<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
@ -1273,7 +1273,7 @@ void gemm_internal<double>(CUDABLAS_GEMM_ARGTYPES(double))
gemm_internal_cublaslt<double>(CUDABLAS_GEMM_ARGS(double));
#endif
}
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
at::native::gemm_internal_ck<double>(CUDABLAS_GEMM_ARGS(double));
}
@ -1289,7 +1289,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
}
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
@ -1341,7 +1341,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
}
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
at::native::gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
}
@ -1357,7 +1357,7 @@ void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
at::native::gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}

View File

@ -2,6 +2,7 @@
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/Functions.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <cstddef>

View File

@ -2,7 +2,6 @@
#include <ATen/Tensor.h>
#include <c10/core/Device.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/flat_hash_map.h>

View File

@ -207,27 +207,6 @@ bool CUDAHooks::hasCuBLASLt() const {
#endif
}
bool CUDAHooks::hasCKSDPA() const {
#if !defined(USE_ROCM)
return false;
#elif defined(USE_ROCM) && defined(USE_ROCM_CK_SDPA)
return true;
#else
return false;
#endif
}
bool CUDAHooks::hasCKGEMM() const {
#if !defined(USE_ROCM)
return false;
#elif defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
return true;
#else
return false;
#endif
}
bool CUDAHooks::hasROCM() const {
// Currently, this is same as `compiledWithMIOpen`.
// But in future if there are ROCm builds without MIOpen,

View File

@ -31,8 +31,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
bool hasCuSOLVER() const override;
bool hasCuBLASLt() const override;
bool hasROCM() const override;
bool hasCKSDPA() const override;
bool hasCKGEMM() const override;
const at::cuda::NVRTC& nvrtc() const override;
DeviceIndex current_device() const override;
bool isBuilt() const override {return true;}

View File

@ -118,14 +118,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return false;
}
virtual bool hasCKSDPA() const {
return false;
}
virtual bool hasCKGEMM() const {
return false;
}
virtual const at::cuda::NVRTC& nvrtc() const {
TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP);
}

Some files were not shown because too many files have changed in this diff.