Compare commits

..

9 Commits

Author SHA1 Message Date
8ac81dba21 add vec128 vecreduce 2025-09-23 17:16:40 +00:00
dae9a71d99 fix ci issues 2025-09-12 09:32:19 +00:00
bf4b0e8c41 fix double free 2025-09-12 09:32:19 +00:00
0384f48daa Fix compile 2025-09-12 09:32:18 +00:00
3b92a1adfe Fix tests 2025-09-12 09:30:57 +00:00
6ca9dc026d add SVE dispatch 2025-09-12 09:30:57 +00:00
a499828924 Make size non-constexpr 2025-09-12 09:30:57 +00:00
e84eabd4f9 Vec length agnostic SVE Vectorized class POC 2025-09-12 09:30:57 +00:00
5e53e458b9 [feat]: add optimized exp_u20 implementation from Arm Optimized Routines (AOR)
Co-authored-by: Fadi Arafeh <Fadi.Arafeh@arm.com>
Signed-off-by: Analle Abuammar <analle.abuammar@arm.com>
2025-09-12 09:30:57 +00:00
950 changed files with 12331 additions and 35094 deletions

View File

@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD enables linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD enables linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOBS=5 is not required for the CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "

View File

@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
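For context, socket.timeout has been an alias of the built-in TimeoutError since Python 3.10, so the two spellings are interchangeable on newer interpreters, while catching socket.timeout also covers older ones. A self-contained sketch of the retry loop shown above (same parameters, not the full CI script):

import socket
import time

def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
    # Retry the TCP connection a fixed number of times, sleeping between
    # attempts, and re-raise the last error once the attempts are exhausted.
    for i in range(attempt_cnt):
        try:
            with socket.create_connection((addr, port), timeout=timeout):
                return
        except (ConnectionRefusedError, socket.timeout):
            if i == attempt_cnt - 1:
                raise
            time.sleep(timeout)

# e.g. wait_for_connection("127.0.0.1", 22)  -- host and port are placeholders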

View File

@ -214,7 +214,8 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
# TODO (huydhn): Upgrade this to Python >= 3.10
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes
@ -262,10 +263,13 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
PYTHON_VERSION=3.10
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
PYTHON_VERSION=3.10
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)

View File

@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a

View File

@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0

View File

@ -1 +1 @@
bbb06c0334a6772b92d24bde54956e675c8c6604
70cbcaca84471df49e81ddc56873c9241b671f8d

View File

@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround: ExecuTorch has moved to numpy 2.0, which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0
popd
}
setup_executorch() {
pushd executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}
if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch

View File

@ -93,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Pinned versions:
#test that import:
mypy==1.16.0 ; platform_system != "Windows"
mypy==1.16.0
# Pin MyPy version because new errors are likely to appear with each release
# Skip on Windows as lots of type annotations are POSIX specific
#Description: linter
#Pinned versions: 1.16.0
#test that import: test_typing.py, test_type_hints.py

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -7,4 +7,4 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh

View File

@ -41,6 +41,7 @@ def sample_vllm_test_library():
"pytest -v -s basic_correctness/test_cumem.py",
"pytest -v -s basic_correctness/test_basic_correctness.py",
"pytest -v -s basic_correctness/test_cpu_offload.py",
"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
],
},
"vllm_basic_models_test": {
@ -67,12 +68,15 @@ def sample_vllm_test_library():
"-v",
"-s",
"entrypoints/llm",
"--ignore=entrypoints/llm/test_lazy_outlines.py",
"--ignore=entrypoints/llm/test_generate.py",
"--ignore=entrypoints/llm/test_generate_multiple_loras.py",
"--ignore=entrypoints/llm/test_collective_rpc.py",
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
"pytest -v -s entrypoints/offline_mode",
"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
"pytest -v -s entrypoints/llm/test_generate.py ",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {

View File

@ -66,11 +66,6 @@ class VllmBuildParameters:
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)
# the cleaning script to remove torch dependencies from pip
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
@ -165,7 +160,6 @@ class VllmBuildRunner(BaseRunner):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()
self.cp_torch_cleaning_script(inputs)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from the root directory to the vllm workspace if they exist
self.cp_torch_whls_if_exist(inputs)
@ -211,11 +205,6 @@ class VllmBuildRunner(BaseRunner):
copy(inputs.torch_whls_path, tmp_dir)
return tmp_dir
def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
script = get_path(inputs.cleaning_script, resolve=True)
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
copy(script, vllm_script)
def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
if not inputs.use_local_dockerfile:
logger.info("using vllm default dockerfile.torch_nightly for build")

View File

@ -11,7 +11,7 @@ from typing import Any
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
from cli.lib.common.path_helper import copy, get_path, remove_dir
from cli.lib.common.path_helper import copy, remove_dir
from cli.lib.common.pip_helper import (
pip_install_first_match,
pip_install_packages,
@ -43,10 +43,6 @@ class VllmTestParameters:
torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)
def __post_init__(self):
if not self.torch_whls_path.exists():
raise ValueError("missing torch_whls_path")
@ -96,13 +92,11 @@ class VllmTestRunner(BaseRunner):
self._set_envs(params)
clone_vllm(dst=self.work_directory)
self.cp_torch_cleaning_script(params)
with working_directory(self.work_directory):
remove_dir(Path("vllm"))
self._install_wheels(params)
self._install_dependencies()
# verify the torches are not overridden by test dependencies
check_versions()
def run(self):
@ -131,11 +125,6 @@ class VllmTestRunner(BaseRunner):
# double check the torches are not overridden by other packages
check_versions()
def cp_torch_cleaning_script(self, params: VllmTestParameters):
script = get_path(params.cleaning_script, resolve=True)
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
copy(script, vllm_script)
def _install_wheels(self, params: VllmTestParameters):
logger.info("Running vllm test with inputs: %s", params)
if not pkg_exists("torch"):

View File

@ -0,0 +1,40 @@
#!/bin/bash
# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"
set -ex -o pipefail
version=${DOCS_VERSION:-nightly}
echo "version: $version"
# Build functorch docs
pushd $pt_checkout/functorch/docs
make html
popd
git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages
if [ "$version" == "main" ]; then
version=nightly
fi
git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"
git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status
if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi
popd

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to

View File

@ -1,25 +0,0 @@
From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
From: Michael Wang <13521008+isVoid@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:28:05 -0700
Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
(#185)
Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -365,6 +365,9 @@ def _find_api(self, fname):
else:
variants = ('_v2', '')
+ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+ return getattr(self.lib, fname)
+
for variant in variants:
try:
return getattr(self.lib, f'{fname}{variant}')
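This removed patch pinned cuCtxGetDevice and cuCtxSynchronize to their unversioned driver symbols instead of letting the lookup prefer the '_v2' variants. A self-contained sketch of that lookup pattern (the fake library object and symbol names are illustrative, not numba-cuda's real driver handle):

class FakeDriverLib:
    # Stand-in for the ctypes handle to libcuda; attributes mimic exported symbols.
    def __init__(self):
        self.cuCtxGetDevice = lambda: "base symbol"
        self.cuMemAlloc_v2 = lambda: "versioned symbol"

def find_api(lib, fname, pinned=("cuCtxGetDevice", "cuCtxSynchronize")):
    # Pinned names resolve to their base entry points; everything else probes
    # the '_v2' variant first and falls back to the plain name.
    if fname in pinned:
        return getattr(lib, fname)
    for variant in ("_v2", ""):
        try:
            return getattr(lib, f"{fname}{variant}")
        except AttributeError:
            continue
    raise AttributeError(fname)

print(find_api(FakeDriverLib(), "cuCtxGetDevice")())  # -> base symbol
print(find_api(FakeDriverLib(), "cuMemAlloc")())      # -> versioned symbol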

View File

@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
echo "Environment variables:"
env
@ -334,17 +324,11 @@ test_python() {
}
test_python_smoke() {
# Smoke tests for H100/B200
# Smoke tests for H100
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python_smoke_b200() {
# Targeted smoke tests for B200 - staged approach to avoid too many failures
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_h100_distributed() {
# Distributed tests at H100
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -1556,10 +1540,14 @@ test_executorch() {
install_torchvision
install_torchaudio
INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
@ -1573,6 +1561,10 @@ test_executorch() {
popd
# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration
assert_git_not_dirty
}
@ -1580,7 +1572,6 @@ test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests
@ -1730,6 +1721,11 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then
if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
test_inductor_distributed
fi
fi
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
@ -1779,8 +1775,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then

View File

@ -137,7 +137,7 @@ sccache --show-stats
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
(
if "%BUILD_ENVIRONMENT%"=="" (
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
) else (
copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

View File

@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
) else (
set CONDA_PARENT_DIR=C:\Jenkins
)
set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3
:: Be conservative here when rolling out the new AMI with conda. This will try
:: to install conda as before if it couldn't find the conda installation. This
:: can be removed eventually after we gain enough confidence in the AMI
if not exist %CONDA_ROOT_DIR% (
if not exist %CONDA_PARENT_DIR%\Miniconda3 (
set INSTALL_FRESH_CONDA=1
)
@ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
)
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call conda activate py_tmp
call pip install -r .ci/docker/requirements-ci.txt
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3

View File

@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
:: build\torch. Rather than changing all these references, making a copy of torch folder
:: from conda to the current workspace is easier. The workspace will be cleaned up after
:: the job anyway
xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
pushd .
if "%VC_VERSION%" == "" (

View File

@ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi
# TODO: Move both of them to Windows AMI
python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
# scipy from 1.6.3 to 1.10
# expecttest from 0.1.3 to 0.3.0
# xdoctest from 1.0.2 to 1.3.0
python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.15.1.0
@ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1
# Install pulp for testing ilps under torch\distributed\_tools
python -m pip install pulp==2.9.0
# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
python -m pip install expecttest==0.3.0
run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@ -177,8 +177,7 @@ source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -73,7 +73,7 @@ exclude =
./docs/src,
./functorch/docs,
./functorch/examples,
./functorch/docs/source/tutorials,
./functorch/notebooks,
./scripts,
./test/generated_type_hints_smoketest.py,
./third_party,

View File

@ -21,7 +21,6 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")
for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)
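The one-character change above widens the glob from the direct children of each *.dist-info directory to its whole subtree. A quick illustration of the pathlib semantics (the wheel layout below is hypothetical):

from pathlib import Path

root = Path("artifacts/dist/torch-2.9.0.dev0")   # hypothetical wheel stem
shallow = list(root.glob("*.dist-info/*"))       # one level below .dist-info
deep = list(root.glob("*.dist-info/**"))         # recurses into subdirectories;
# on Python 3.13+ a trailing "**" also yields the files inside those
# subdirectories, so nested metadata (e.g. a licenses/ folder) gets rewritten too.
print(len(shallow), len(deep))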

View File

@ -6,12 +6,6 @@ inputs:
cuda-version:
description: which cuda version to install, 'cpu' for none
required: true
python-version:
required: false
type: string
default: "3.10"
description: |
The python version to be used. Will be 3.10 by default
runs:
using: composite
@ -44,24 +38,18 @@ runs:
CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"
{
echo "CONDA=${CONDA}";
echo "CONDA_RUN=${CONDA} run --no-capture-output";
echo "CONDA_BUILD=${CONDA} run conda-build";
echo "CONDA_INSTALL=${CONDA} install";
} >> "${GITHUB_ENV}"
- name: Setup Python3
env:
PYTHON_VERSION: ${{ inputs.python-version }}
shell: bash
run: |
set +e
set -x
# Create new py_tmp env with python-version
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
PYTHON3=$(${CONDA_RUN} which python3)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then
@ -74,7 +62,7 @@ runs:
# installation, which is Python 3 based. Its Python defaults to Python 3. Further, there
# is also the Miniconda installation that is Python 2 based, and both can be installed if
# needed. In both cases, Python binary is just called python
PYTHON=$(${CONDA_RUN} -n py_tmp which python)
PYTHON=$(${CONDA_RUN} which python)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then

View File

@ -1 +1 @@
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
fa5142928ee157aa65137c4ecff2fe9b1a9e0648

View File

@ -1 +1 @@
090197034faf3b193c4467cedeb9281e3078892d
cc99baf14dacc2497d0c5ed84e076ef2c37f6a4d

View File

@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
6c5478ff7c3d50dd1e3047d72ec5909bea474073

View File

@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if it is not already installed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if it is not already installed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \

View File

@ -1,17 +0,0 @@
import glob
requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]
for file in requires_files:
print(f">>> cleaning {file}")
with open(file) as f:
lines = f.readlines()
if "torch" in "".join(lines).lower():
print("removed:")
with open(file, "w") as f:
for line in lines:
if "torch" not in line.lower():
f.write(line)
print(f"<<< done cleaning {file}")
print()

3 .github/labeler.yml vendored
View File

@ -130,6 +130,3 @@
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

View File

@ -36,7 +36,6 @@ ciflow_push_tags:
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
- ciflow/b200
retryable_workflows:
- pull
- trunk

View File

@ -15,7 +15,7 @@ optree==0.13.0
packaging==23.1
parameterized==0.8.1
pillow==10.3.0
protobuf==5.29.5
protobuf==5.29.4
psutil==5.9.8
pygments==2.15.0
pytest-cpp==2.3.0
@ -26,7 +26,7 @@ pytest-xdist==3.3.1
pytest==7.3.2
pyyaml==6.0.2
scipy==1.12.0
setuptools==78.1.1
setuptools==72.1.0
sympy==1.13.3
tlparse==0.4.0
tensorboard==2.13.0

View File

@ -39,9 +39,7 @@ def main() -> None:
pull_request_label_names = [label.name for label in pull_request_labels]
issue_label_names = [label.name for label in issue_labels]
labels_to_add = [
label
for label in issue_label_names
if label not in pull_request_label_names and label != "actionable"
label for label in issue_label_names if label not in pull_request_label_names
]
if not labels_to_add:
print("The pull request already has the same labels.")

View File

@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["6.4"],
python_versions=["3.10"],
python_versions=["3.9"],
),
ciflow_config=CIFlowConfig(
labels={

View File

@ -84,9 +84,6 @@ repackage_wheel() {
rm -rf $package
}
# Required to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package

View File

@ -187,6 +187,8 @@ jobs:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}
- name: configure aws credentials

View File

@ -75,6 +75,10 @@ jobs:
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@ -207,6 +211,16 @@ jobs:
path: cppdocs/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs
- name: Upload functorch Docs Preview
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()

View File

@ -2,12 +2,6 @@ name: Get Changed Files
on:
workflow_call:
inputs:
all_files:
description: "Whether to return all files instead of just changed files"
required: false
type: boolean
default: false
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"
@ -32,23 +26,17 @@ jobs:
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"
# Check if all_files is requested
if [ "${{ inputs.all_files }}" = "true" ]; then
echo "all_files input is true, returning all files"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
else
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"

View File

@ -169,7 +169,7 @@ jobs:
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run

View File

@ -62,11 +62,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -81,9 +76,10 @@ jobs:
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
@ -135,9 +131,6 @@ jobs:
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@ -145,6 +138,9 @@ jobs:
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@ -182,12 +178,6 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
env:
@ -203,22 +193,20 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
set -x
@ -248,7 +236,6 @@ jobs:
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
@ -266,12 +253,10 @@ jobs:
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \

View File

@ -151,7 +151,7 @@ jobs:
BUILD_WHEEL: 1
MAX_JOBS: 8
CUDA_VERSION: ${{ inputs.cuda-version }}
PYTHON_VERSION: "3.10"
PYTHON_VERSION: "3.9"
SCCACHE_BUCKET: "ossci-compiler-cache"
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SCCACHE_REGION: us-east-1

View File

@ -184,7 +184,7 @@ jobs:
env:
USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: "3.10"
PYTHON_VERSION: 3.9
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}

View File

@ -12,9 +12,6 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -27,33 +24,21 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: ${{ matrix.platform }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@ -151,7 +136,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -161,29 +146,27 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Configure AWS credentials(PyTorch account) for main
if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
@ -207,15 +190,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |

View File

@ -70,8 +70,9 @@ jobs:
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]

View File

@ -44,7 +44,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_10-rocm6_4-build:
manywheel-py3_9-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -58,16 +58,16 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-rocm6_4
build_name: manywheel-py3_9-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-rocm6_4-test: # Testing
manywheel-py3_9-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-rocm6_4-build
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
@ -82,14 +82,14 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_10-rocm6_4
name: manywheel-py3_9-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@ -37,7 +37,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -56,7 +56,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -80,7 +80,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -106,7 +106,7 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -122,8 +122,8 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -80,7 +80,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -107,7 +107,7 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -124,7 +124,7 @@ jobs:
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}

View File

@ -39,7 +39,7 @@ jobs:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -62,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -154,7 +154,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -200,7 +200,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -110,7 +110,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -127,7 +127,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -79,7 +79,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -101,7 +101,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -31,8 +31,6 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
with:
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -55,7 +53,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -266,10 +264,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.10
- name: Setup Python 3.9
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies

View File

@ -14,10 +14,6 @@ on:
schedule:
# Run at 07:00 UTC every Sunday
- cron: 0 7 * * 0
pull_request:
paths:
- benchmarks/operator_benchmark/**
- .github/workflows/operator_benchmark.yml
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -33,7 +29,7 @@ jobs:
name: opbenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -46,7 +42,7 @@ jobs:
name: opbenchmark-on-demand-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -59,7 +55,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -316,6 +316,32 @@ jobs:
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml

View File

@ -1,76 +0,0 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
name: B200 Smoke Tests
on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -240,7 +240,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -255,31 +255,7 @@ jobs:
- verify-cachebench-cpu-build
- target-determination
with:
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

View File

@ -53,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

View File

@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11

.gitignore
View File

@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak
@ -392,5 +389,3 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py

View File

@ -13,7 +13,7 @@ exclude_patterns = [
'**/fb/**',
'functorch/docs/**',
'functorch/examples/**',
'functorch/docs/source/tutorials/**',
'functorch/notebooks/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'scripts/**',
@ -123,7 +123,6 @@ is_formatter = true
code = 'MYPY'
include_patterns = [
'setup.py',
'functorch/dim/**/*.py',
'torch/**/*.py',
'torch/**/*.pyi',
'caffe2/**/*.py',
@ -196,7 +195,6 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',
@ -966,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@ -1411,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',
@ -1567,6 +1568,7 @@ include_patterns = [
exclude_patterns = [
'caffe2/**',
'functorch/docs/**',
'functorch/notebooks/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'test/dynamo/cpython/**',

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -810,7 +811,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ libtorch_python_distributed_sources
+ if_cuda(libtorch_python_distributed_sources)
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -180,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -379,13 +379,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -438,10 +431,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()
@ -663,11 +657,6 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -885,7 +874,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
"USE_CUDA OR USE_ROCM"
OFF)
cmake_dependent_option(
@ -902,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()
@ -1432,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommended to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()

View File

@ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

View File

@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
}
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
if (op.size() == 0){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
return false;
}
#endif
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
}
void Context::setAllowTF32CuBLAS(bool b) {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
<< "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
return;
}
#endif
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
for (auto p : iterp->second) {
msg += p;
msg += " ";
}
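A minimal scalar sketch of the ROCm TF32 gating added above: the matmul precision request is honoured only when the HIPBLASLT_ALLOW_TF32 environment variable is explicitly set. The helper names below are illustrative, not the ATen API (the diff itself goes through c10::utils::check_env).
#include <cstdlib>
#include <cstring>
// Hypothetical helper: true only when the variable is set to "1".
static bool env_flag_enabled(const char* name) {
  const char* value = std::getenv(name);
  return value != nullptr && std::strcmp(value, "1") == 0;
}
// Mirrors the gate in allowTF32CuBLAS/setAllowTF32CuBLAS: without the
// opt-in flag, TF32 stays disabled regardless of the requested setting.
bool allow_tf32_matmul_rocm(bool requested) {
  if (!env_flag_enabled("HIPBLASLT_ALLOW_TF32")) {
    return false;
  }
  return requested;
}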

View File

@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
dtype.lanes = 2;
dtype.bits = 4;
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:
@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}
@ -401,13 +320,30 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect if we do not meet continuous pattern
// and the size is 1, so there is opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
view = src.as_strided(shape, {1}, src.storage_offset());
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
@ -418,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);
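The stride handling above deals with size-1 dimensions, whose stride is meaningless for addressing but can confuse DLPack consumers (gh-83069). A minimal scalar sketch of the rule, with illustrative names rather than the ATen API:
#include <cstdint>
#include <vector>
// A dim of size 0 or 1 never contributes to addressing, so its stride can
// be rewritten to 1; all other strides must be left untouched.
std::vector<int64_t> normalize_unit_dim_strides(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides) {
  std::vector<int64_t> out = strides;
  int64_t expected = 1;
  bool needs_fix = false;
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; --i) {
    // Only a size-1 dim with an unexpected stride triggers normalization.
    if (strides[i] != expected && sizes[i] == 1) {
      needs_fix = true;
    }
    expected *= sizes[i];
  }
  if (needs_fix) {
    for (size_t i = 0; i < sizes.size(); ++i) {
      if (sizes[i] < 2) {
        out[i] = 1;
      }
    }
  }
  return out;
}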

View File

@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;

View File

@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
view_value.device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
@ -485,10 +485,7 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI
c10::Device FunctionalTensorWrapper::device_custom() const {
// The storage pointer already uses the underlying tensor custom device (if
// applicable) to extract the device. So, we dont have to recurse again by
// doing value_.unsafeGetTensorImpl()->device().
return storage().data_ptr().device();
return value_.unsafeGetTensorImpl()->device();
}
at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
return value_.unsafeGetTensorImpl()->sizes();

View File

@ -10,6 +10,10 @@
#include <ideep.hpp>
#endif
#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif
#include <caffe2/core/common.h>
#include <ATen/native/DispatchStub.h>
@ -103,7 +107,9 @@ std::string get_cpu_capability() {
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
case native::CPUCapability::ZVECTOR:
return "Z VECTOR";
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
case native::CPUCapability::SVE:
return "SVE";
case native::CPUCapability::SVE256:
return "SVE256";
#else
@ -118,6 +124,12 @@ std::string get_cpu_capability() {
return "";
}
int get_sve_len() {
// Query the maximum SVE vector length supported by the hardware via cpuinfo.
return cpuinfo_get_max_arm_sve_length();
}
static std::string used_cpu_capability() {
// It is possible that we override the cpu_capability with
// environment variable

View File

@ -15,4 +15,6 @@ TORCH_API std::string get_cxx_flags();
TORCH_API std::string get_cpu_capability();
TORCH_API int get_sve_len();
} // namespace at

View File

@ -34,9 +34,9 @@ inline scalar_t vec_reduce_all(
scalar_t acc_arr[Vec::size()];
acc_vec.store(acc_arr);
for (const auto i : c10::irange(1, size)) {
std::array<scalar_t, Vec::size()> acc_arr_next = {0};
scalar_t acc_arr_next[Vec::size()] = {0};
acc_arr_next[0] = acc_arr[i];
Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
Vec acc_vec_next = Vec::loadu(acc_arr_next);
acc_vec = vec_fun(acc_vec, acc_vec_next);
}
acc_vec.store(acc_arr);
@ -102,8 +102,7 @@ struct VecReduceAllSIMD<float, Op> {
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
// !defined(C10_MOBILE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
@ -143,8 +142,7 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && !defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
defined(CPU_CAPABILITY_SVE256)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE))
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
@ -152,18 +150,28 @@ struct VecReduceAllSIMD<float, Op> {
const Vectorized<float>& acc_vec) {
using Vec = Vectorized<float>;
Vec v = acc_vec;
// 128-bit shuffle
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
Vec v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 64-bit shuffle
ind = svdupq_n_u32(2, 3, 0, 1);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 32-bit shuffle
ind = svdupq_n_u32(1, 0, 2, 3);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
if (Vec::size() == 8) {
// 128-bit shuffle
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
Vec v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 64-bit shuffle
ind = svdupq_n_u32(2, 3, 0, 1);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 32-bit shuffle
ind = svdupq_n_u32(1, 0, 2, 3);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
} else {
svuint32_t ind = svdupq_n_u32(2, 3, 0, 1); // 64-bit stride-2
Vec v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
ind = svdupq_n_u32(1, 0, 2, 3); // 32-bit stride-1
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
}
return svlasta(svpfalse(), v);
}
};
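The SVE branch above reduces a whole register by repeatedly folding half of the lanes onto the other half with svtbl shuffles (128-bit, then 64-bit, then 32-bit strides for an 8-lane vector). A scalar sketch of the same log2-depth tree reduction, assuming a power-of-two lane count:
#include <array>
#include <functional>
template <size_t N, typename Op>
float reduce_lanes(std::array<float, N> lanes, Op op) {
  static_assert((N & (N - 1)) == 0, "lane count must be a power of two");
  // Each round halves the number of live lanes, mirroring one shuffle step.
  for (size_t stride = N / 2; stride > 0; stride /= 2) {
    for (size_t i = 0; i < stride; ++i) {
      lanes[i] = op(lanes[i], lanes[i + stride]);
    }
  }
  return lanes[0];
}
// Example: horizontal sum of an 8-lane (256-bit SVE) float register.
// float s = reduce_lanes<8>(lane_values, std::plus<float>{});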

View File

@ -4,7 +4,7 @@
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
// Define the data type of VLS(vector-length specific).
typedef svbool_t vls_pred_t
@ -77,4 +77,4 @@ typedef svfloat64_t vls_float64_t
#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK)
#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK)
#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

View File

@ -19,7 +19,7 @@ namespace vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
template <>
struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
@ -230,8 +230,6 @@ __attribute__((optimize("no-tree-vectorize")))
#endif
inline std::tuple<Vectorized<float>, Vectorized<float>>
convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
auto bf16_vec1 = svzip1_bf16(zero, a);
auto bf16_vec2 = svzip2_bf16(zero, a);
@ -243,19 +241,18 @@ convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
inline Vectorized<c10::BFloat16> convert_float_bfloat16(
const Vectorized<float>& a,
const Vectorized<float>& b) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, a);
svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b);
return Vectorized<c10::BFloat16>(svuzp1_bf16(x1, x2));
}
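The zip/uzp pairing above works because a bfloat16 value is exactly the upper 16 bits of an IEEE-754 float32: interleaving each bf16 lane with a zero low half yields the widened float directly. A scalar sketch of that widening, with an illustrative name:
#include <cstdint>
#include <cstring>
float bf16_bits_to_float(uint16_t bf16_bits) {
  // Place the bf16 payload in the high half, zero the low half,
  // then reinterpret the 32-bit pattern as a float.
  uint32_t widened = static_cast<uint32_t>(bf16_bits) << 16;
  float out;
  std::memcpy(&out, &widened, sizeof(out));
  return out;
}
The narrowing direction (convert_float_bfloat16) is the reverse: each float is rounded to bf16 and the two half-width results are packed back into one register with svuzp1_bf16.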
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
__at_align__ float * values = new float[Vectorized<float>::size()];
for (const auto k : c10::irange(Vectorized<float>::size())) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
delete[] values;
}
inline void load_fp32_from_bf16(
@ -308,8 +305,8 @@ Vectorized<c10::BFloat16> inline operator/(
}
inline Vectorized<BFloat16>::Vectorized() {
const short zero = 0;
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
auto vals_f = svdup_n_f32(0);
values = convert_float_bfloat16(vals_f, vals_f);
}
inline Vectorized<BFloat16>::Vectorized(int val) {

View File

@ -8,7 +8,7 @@
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
#include <ATen/cpu/vec/sve/vec_double.h>
#include <ATen/cpu/vec/sve/vec_float.h>
@ -27,7 +27,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \
@ -231,6 +231,5 @@ std::pair<
#endif // __ARM_FEATURE_BF16
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY
} // namespace at::vec
}

View File

@ -22,7 +22,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
template <>
struct is_vec_specialized_for<double> : std::bool_constant<true> {};
@ -55,10 +55,11 @@ class Vectorized<double> {
operator svfloat64_t() const {
return values;
}
template <uint64_t mask>
static Vectorized<double> blend(
const Vectorized<double>& a,
const Vectorized<double>& b) {
const Vectorized<double>& b,
int64_t mask
) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int64_t flag_arr[size()];

View File

@ -2,8 +2,10 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>
#include <algorithm>
#include <cmath>
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#include <sleef.h>
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
@ -22,7 +24,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
template <>
struct is_vec_specialized_for<float> : std::bool_constant<true> {};
@ -30,52 +32,77 @@ struct is_vec_specialized_for<float> : std::bool_constant<true> {};
template <>
class Vectorized<float> {
private:
vls_float32_t values;
__at_align__ float values[2048 / sizeof(float)];
public:
using value_type = float;
using size_type = int;
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(float);
static inline size_type size() {
return svcntw();
}
Vectorized() {
values = svdup_n_f32(0);
inline Vectorized() {svst1_f32(ptrue, values, svdup_n_f32(0));}
inline Vectorized(const float val) {
svst1_f32(ptrue, values, svdup_n_f32(val));
}
Vectorized(svfloat32_t v) : values(v) {}
Vectorized(float val) {
values = svdup_n_f32(val);
inline Vectorized(const svfloat32_t val) {
svst1_f32(ptrue, values, val);
}
template <
typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
Vectorized(Args... vals) {
__at_align__ float buffer[size()] = {vals...};
values = svld1_f32(ptrue, buffer);
template<typename T,
typename = std::enable_if_t<std::is_pointer_v<T>>>
inline Vectorized(float * val) {
svst1_f32(ptrue, values, svld1_f32(ptrue, val));
}
operator svfloat32_t() const {
return values;
template<typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
inline Vectorized(Args... vals) {
values = { vals... };
}
template <uint64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int32_t flag_arr[size()];
inline operator svfloat32_t() const {
return svld1_f32(ptrue, values);
}
static inline Vectorized<float> from_ptr(const float * vs) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(ptrue, static_cast<const float *>(vs)));
return v;
}
static inline Vectorized<float> from_ptr(const float * vs, int count) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(svwhilelt_b32_s32(0, count), static_cast<const float *>(vs)));
return v;
}
inline void set_lane(int i, float value) {
values[i] = value;
}
inline Vectorized<float> map(float (*fn)(float)) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i]));
}
return result;
}
inline Vectorized<float> map2(float (*fn)(float, float), const Vectorized<float> &b) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i], b.values[i]));
}
return result;
}
static inline Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b, const uint64_t mask) {
// Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise.
__at_align__ int32_t * flag_arr = new int32_t[size()];
for (int i = 0; i < size(); i++) {
flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
}
// Load the flag array into an SVE int32 vector.
svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr);
// Compare each lane of int_mask to 0; returns an svbool_t predicate where
// true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else
// from a.
svfloat32_t result = svsel_f32(blend_mask, b.values, a.values);
return Vectorized<float>(result);
svint32_t int_mask = svld1_s32(ptrue, flag_arr);
delete[] flag_arr;
// Compare each lane of int_mask to 0; returns an svbool_t predicate where true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(ptrue, int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else from a.
return svsel_f32(blend_mask, b, a);
}
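A scalar sketch of the runtime blend above: bit i of the mask selects lane i from b when set, otherwise from a. The SVE version materialises the same per-lane decision as a predicate (svcmpne over a flag vector) and applies it with svsel. Names below are illustrative:
#include <cstdint>
#include <vector>
std::vector<float> blend_sketch(const std::vector<float>& a,
                                const std::vector<float>& b,
                                uint64_t mask) {
  std::vector<float> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    // Bit i set -> take the lane from b, otherwise keep a's lane.
    out[i] = (mask & (1ull << i)) ? b[i] : a[i];
  }
  return out;
}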
static Vectorized<float> blendv(
static inline Vectorized<float> blendv(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask_) {
@ -84,16 +111,18 @@ class Vectorized<float> {
return svsel_f32(mask, b, a);
}
template <typename step_t>
static Vectorized<float> arange(
static inline Vectorized<float> arange(
float base = 0.f,
step_t step = static_cast<step_t>(1)) {
__at_align__ float buffer[size()];
__at_align__ float * buffer = new float[size()];
for (int64_t i = 0; i < size(); i++) {
buffer[i] = base + i * step;
}
return svld1_f32(ptrue, buffer);
auto tmp = Vectorized<float>::from_ptr(buffer);
delete[] buffer;
return tmp;
}
static Vectorized<float> set(
static inline Vectorized<float> set(
const Vectorized<float>& a,
const Vectorized<float>& b,
int64_t count = size()) {
@ -169,271 +198,213 @@ class Vectorized<float> {
poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);
return poly;
}
static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
svbool_t pg = svwhilelt_b32(0ull, count);
return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
static inline Vectorized<float> loadu(const void* ptr) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr));
}
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {
svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
} else {
svbool_t pg = svwhilelt_b32(0ull, count);
svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
}
static inline Vectorized<float> loadu(const void* ptr, int64_t count) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr), count);
}
const float& operator[](int idx) const = delete;
float& operator[](int idx) = delete;
int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit
// and others are translated to 0-bit
inline void store(void* ptr) const {
svst1_f32(ptrue, static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline void store(void* ptr, int count) const {
svst1_f32(svwhilelt_b32_s32(0, count), static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline const float& operator[](int idx) const {
return values[idx];
};
inline float& operator[](int idx) {
return values[idx];
};
inline int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
int64_t mask = 0;
__at_align__ int32_t mask_array[size()];
__at_align__ int32_t * mask_array = new int32_t[size()];
svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
svst1_s32(
ptrue,
mask_array,
svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK));
for (int64_t i = 0; i < size(); ++i) {
if (mask_array[i])
mask |= (1ull << i);
svbool_t svbool_mask = svcmpeq_f32(ptrue, *this, ZERO_F32);
svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask,
ALL_S32_TRUE_MASK,
ALL_S32_FALSE_MASK));
for (int64_t j = 0; j < size(); ++j) {
if (mask_array[j]) mask |= (1ull << j);
}
delete[] mask_array;
return mask;
}
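zero_mask above packs a per-lane comparison into an integer: bit i is set when lane i equals zero, which callers can then test or iterate cheaply. A scalar sketch, assuming the lane count is a runtime value as in the length-agnostic SVE path:
#include <cstdint>
uint64_t zero_mask_sketch(const float* lanes, int lane_count) {
  uint64_t mask = 0;
  for (int i = 0; i < lane_count; ++i) {
    if (lanes[i] == 0.0f) {
      mask |= (1ull << i);   // lane i is zero -> set bit i
    }
  }
  return mask;
}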
Vectorized<float> isnan() const {
inline Vectorized<float> isnan() const {
// NaN check
svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
auto mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
bool has_inf_nan() const {
return svptest_any(
ptrue,
svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
inline bool has_inf_nan() const {
return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, *this, *this), ZERO_F32));
}
Vectorized<float> map(float (*f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
inline Vectorized<float> abs() const {
return svabs_f32_x(ptrue, *this);
}
Vectorized<float> abs() const {
return svabs_f32_x(ptrue, values);
}
Vectorized<float> angle() const {
inline Vectorized<float> angle() const {
const auto nan_vec = svdup_n_f32(NAN);
const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
const auto nan_mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
const auto pi = svdup_n_f32(c10::pi<float>);
const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
const auto neg_mask = svcmplt_f32(ptrue, *this, ZERO_F32);
auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
angle = svsel_f32(nan_mask, nan_vec, angle);
return angle;
return svsel_f32(nan_mask, nan_vec, angle);
}
Vectorized<float> real() const {
return values;
inline Vectorized<float> real() const {
return *this;
}
Vectorized<float> imag() const {
inline Vectorized<float> imag() const {
return Vectorized<float>(0.f);
}
Vectorized<float> conj() const {
return values;
inline Vectorized<float> conj() const {
return *this;
}
Vectorized<float> acos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
inline Vectorized<float> acos() const {
return USE_SLEEF(Sleef_acosfx_u10sve(*this), map(std::acos));
}
Vectorized<float> acosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
inline Vectorized<float> acosh() const {
return USE_SLEEF(Sleef_acoshfx_u10sve(*this), map(std::acosh));
}
Vectorized<float> asin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
inline Vectorized<float> asin() const {
return USE_SLEEF(Sleef_asinfx_u10sve(*this), map(std::asin));
}
Vectorized<float> asinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinhfx_u10sve(values)), map(std::asinh));
inline Vectorized<float> asinh() const {
return USE_SLEEF(Sleef_asinhfx_u10sve(*this), map(std::asinh));
}
Vectorized<float> atan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
inline Vectorized<float> atan() const {
return USE_SLEEF(Sleef_atanfx_u10sve(*this), map(std::atan));
}
Vectorized<float> atanh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
inline Vectorized<float> atanh() const {
return USE_SLEEF(Sleef_atanhfx_u10sve(*this), map(std::atanh));
}
Vectorized<float> atan2(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::atan2(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> copysign(const Vectorized<float>& sign) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_sign[size()];
store(tmp);
sign.store(tmp_sign);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
}
return loadu(tmp);
})} Vectorized<float> erf() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
inline Vectorized<float> atan2(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_atan2fx_u10sve(*this, b), map2(std::atan2, b));
}
Vectorized<float> erfc() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
inline Vectorized<float> copysign(const Vectorized<float> &sign) const {
return USE_SLEEF(Sleef_copysignfx_sve(*this, sign), map2(std::copysign, sign));
}
Vectorized<float> erfinv() const {
inline Vectorized<float> erf() const {
return USE_SLEEF(Sleef_erffx_u10sve(*this), map(std::erf));
}
inline Vectorized<float> erfc() const {
return USE_SLEEF(Sleef_erfcfx_u15sve(*this), map(std::erfc));
}
inline Vectorized<float> erfinv() const {
return map(calc_erfinv);
}
Vectorized<float> exp() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
inline Vectorized<float> exp() const {
return USE_SLEEF(Sleef_expfx_u10sve(*this), map(std::exp));
}
Vectorized<float> exp2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
inline Vectorized<float> exp2() const {
return USE_SLEEF(Sleef_exp2fx_u10sve(*this), map(std::exp2));
}
Vectorized<float> expm1() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
inline Vectorized<float> expm1() const {
return USE_SLEEF(Sleef_expm1fx_u10sve(*this), map(std::expm1));
}
// Implementation copied from Arm Optimized Routines:
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c
Vectorized<float> exp_u20() const {
return exp();
// special case to handle special inputs that are too large or too small
// i.e. where there's at least one element x, s.t. |x| >= 87.3...
svbool_t is_special_case = svacgt (svptrue_b32(), *this, 0x1.5d5e2ap+6f);
if (svptest_any (svptrue_b32(), is_special_case)) {
return exp();
}
const svfloat32_t ln2_hi = svdup_n_f32(0x1.62e4p-1f);
const svfloat32_t ln2_lo = svdup_n_f32(0x1.7f7d1cp-20f);
const svfloat32_t c1 = svdup_n_f32(0.5f);
const svfloat32_t inv_ln2 = svdup_n_f32(0x1.715476p+0f);
const float shift = 0x1.803f8p17f;
/* n = round(x/(ln2/N)). */
svfloat32_t z = svmad_x (svptrue_b32(), inv_ln2, *this, shift);
svfloat32_t n = svsub_x (svptrue_b32(), z, shift);
/* r = x - n*ln2/N. */
svfloat32_t r = *this;
r = svmls_x(svptrue_b32(), r, n, ln2_hi);
r = svmls_x(svptrue_b32(), r, n, ln2_lo);
/* scale = 2^(n/N). */
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. */
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1);
return svmla_x (svptrue_b32(), scale, scale, poly);
}
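The exp_u20 fast path above follows the Arm Optimized Routines scheme: write x = n*ln2 + r with a hi/lo split of ln2 so r stays accurate, obtain 2^n via the FEXPA instruction (svexpa), and approximate exp(r) by 1 + r + 0.5*r^2. A scalar sketch of the same reduction, using std::ldexp in place of FEXPA and a rounding intrinsic in place of the shift trick:
#include <cmath>
float exp_fast_sketch(float x) {
  const float inv_ln2 = 0x1.715476p+0f;   // 1/ln(2)
  const float ln2_hi  = 0x1.62e4p-1f;     // high part of ln(2)
  const float ln2_lo  = 0x1.7f7d1cp-20f;  // low part of ln(2)
  float n = std::nearbyintf(x * inv_ln2); // n = round(x / ln2)
  float r = x - n * ln2_hi;               // r = x - n*ln2, in two steps
  r -= n * ln2_lo;
  float poly = 1.0f + r + 0.5f * r * r;   // exp(r) ~ 1 + r + r^2/2
  return std::ldexp(poly, static_cast<int>(n));  // scale by 2^n
}
This is the low-accuracy path; the special-case branch at the top of exp_u20 falls back to the full-precision exp() whenever any |x| is large enough to overflow the reduction.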
Vectorized<float> fexp_u20() const {
return exp();
return exp_u20();
}
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_q[size()];
store(tmp);
q.store(tmp_q);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
}
return loadu(tmp);
})} Vectorized<float> hypot(const Vectorized<float>& b) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> i0() const {
inline Vectorized<float> fmod(const Vectorized<float>& q) const {
return USE_SLEEF(Sleef_fmodfx_sve(*this, q), map2(std::fmod, q));
}
inline Vectorized<float> hypot(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_hypotfx_u05sve(*this, b), map2(std::hypot, b));
}
inline Vectorized<float> i0() const {
return map(calc_i0);
}
Vectorized<float> i0e() const {
return map(calc_i0e);
inline Vectorized<float> i0e() const {
return map(calc_i0e<float>);
}
Vectorized<float> digamma() const {
inline Vectorized<float> digamma() const {
return map(calc_digamma);
}
Vectorized<float> igamma(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
inline Vectorized<float> igamma(const Vectorized<float> &x) const {
return map2(calc_igamma<float>, x);
}
Vectorized<float> igammac(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);
inline Vectorized<float> igammac(const Vectorized<float> &x) const {
return map2(calc_igammac<float>, x);
}
Vectorized<float> nextafter(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> log() const {
return USE_SLEEF(
Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
inline Vectorized<float> nextafter(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_nextafterfx_sve(*this, b), map2(std::nextafter, b));
}
Vectorized<float> log2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
inline Vectorized<float> log() const {
return USE_SLEEF(Sleef_logfx_u10sve(*this), map(std::log));
}
Vectorized<float> log10() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
inline Vectorized<float> log2() const {
return USE_SLEEF(Sleef_log2fx_u10sve(*this), map(std::log2));
}
Vectorized<float> log1p() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
inline Vectorized<float> log10() const {
return USE_SLEEF(Sleef_log10fx_u10sve(*this), map(std::log10));
}
Vectorized<float> frac() const;
Vectorized<float> sin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
inline Vectorized<float> log1p() const {
return USE_SLEEF(Sleef_log1pfx_u10sve(*this), map(std::log1p));
}
Vectorized<float> sinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
inline Vectorized<float> frac() const;
inline Vectorized<float> sin() const {
return USE_SLEEF(Sleef_sinfx_u10sve(*this), map(std::sin));
}
Vectorized<float> cos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
inline Vectorized<float> sinh() const {
return USE_SLEEF(Sleef_sinhfx_u10sve(*this), map(std::sinh));
}
Vectorized<float> cosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
inline Vectorized<float> cos() const {
return USE_SLEEF(Sleef_cosfx_u10sve(*this), map(std::cos));
}
Vectorized<float> ceil() const {
return svrintp_f32_x(ptrue, values);
inline Vectorized<float> cosh() const {
return USE_SLEEF(Sleef_coshfx_u10sve(*this), map(std::cosh));
}
Vectorized<float> floor() const {
return svrintm_f32_x(ptrue, values);
inline Vectorized<float> ceil() const {
return svrintp_f32_x(ptrue, *this);
}
Vectorized<float> neg() const {
return svneg_f32_x(ptrue, values);
inline Vectorized<float> floor() const {
return svrintm_f32_x(ptrue, *this);
}
Vectorized<float> round() const {
return svrinti_f32_x(ptrue, values);
inline Vectorized<float> neg() const {
return svneg_f32_x(ptrue, *this);
}
Vectorized<float> tan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
inline Vectorized<float> round() const {
return svrinti_f32_x(ptrue, *this);
}
inline Vectorized<float> tan() const {
return USE_SLEEF(Sleef_tanfx_u10sve(*this), map(std::tan));
}
// Implementation is picked from
// https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
Vectorized<float> tanh() const {
inline Vectorized<float> tanh() const {
// Constants used for the tanh calculation.
const svfloat32_t CONST_1 =
svdup_n_f32(1.f); // Constant 1.0f for the tanh formula.
@ -450,7 +421,7 @@ class Vectorized<float> {
// instability. svmax_f32_z ensures values are greater than -10, and
// svmin_f32_z ensures they are less than 10.
svfloat32_t x = svmin_f32_z(
ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH);
ptrue, svmax_f32_z(ptrue, *this, CONST_MIN_TANH), CONST_MAX_TANH);
// Step 2: Calculate exp(2 * x), where x is the clamped value.
// svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of
@ -472,104 +443,85 @@ class Vectorized<float> {
// Return the calculated tanh values.
return tanh;
}
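A scalar sketch of the tanh scheme above: clamp the input to a numerically safe range, then evaluate tanh(x) = (exp(2x) - 1) / (exp(2x) + 1). Beyond roughly |x| = 10 the result already saturates to +/-1, so the clamp does not change the answer:
#include <algorithm>
#include <cmath>
float tanh_sketch(float x) {
  const float kMin = -10.0f, kMax = 10.0f;
  float clamped = std::min(std::max(x, kMin), kMax);  // avoid exp overflow
  float e2x = std::exp(2.0f * clamped);
  return (e2x - 1.0f) / (e2x + 1.0f);
}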
Vectorized<float> trunc() const {
return svrintz_f32_x(ptrue, values);
inline Vectorized<float> trunc() const {
return svrintz_f32_x(ptrue, *this);
}
Vectorized<float> lgamma() const {
return USE_SLEEF(
Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
inline Vectorized<float> lgamma() const {
return USE_SLEEF(Sleef_lgammafx_u10sve(*this), map(std::lgamma));
}
Vectorized<float> sqrt() const {
return svsqrt_f32_x(ptrue, values);
inline Vectorized<float> sqrt() const {
return svsqrt_f32_x(ptrue, *this);
}
Vectorized<float> reciprocal() const {
return svdivr_f32_x(ptrue, values, ONE_F32);
inline Vectorized<float> reciprocal() const {
return svdivr_f32_x(ptrue, *this, svdup_n_f32(1.f));
}
Vectorized<float> rsqrt() const {
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
inline Vectorized<float> rsqrt() const {
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, *this), ONE_F32);
}
Vectorized<float> pow(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_powfx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::pow(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} // Comparison using the _CMP_**_OQ predicate.
// `O`: get false if an operand is NaN
// `Q`: do not raise if an operand is NaN
Vectorized<float> operator==(const Vectorized<float>& other) const {
svbool_t mask = svcmpeq_f32(ptrue, values, other);
inline Vectorized<float> pow(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_powfx_u10sve(*this, b), map2(std::pow, b));
}
// Comparison using the _CMP_**_OQ predicate.
// `O`: get false if an operand is NaN
// `Q`: do not raise if an operand is NaN
inline Vectorized<float> operator==(const Vectorized<float>& other) const {
svbool_t mask = svcmpeq_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
inline Vectorized<float> operator!=(const Vectorized<float>& other) const {
svbool_t mask = svcmpne_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
inline Vectorized<float> operator<(const Vectorized<float>& other) const {
svbool_t mask = svcmplt_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator!=(const Vectorized<float>& other) const {
svbool_t mask = svcmpne_f32(ptrue, values, other);
inline Vectorized<float> operator<=(const Vectorized<float>& other) const {
svbool_t mask = svcmple_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator<(const Vectorized<float>& other) const {
svbool_t mask = svcmplt_f32(ptrue, values, other);
inline Vectorized<float> operator>(const Vectorized<float>& other) const {
svbool_t mask = svcmpgt_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator<=(const Vectorized<float>& other) const {
svbool_t mask = svcmple_f32(ptrue, values, other);
inline Vectorized<float> operator>=(const Vectorized<float>& other) const {
svbool_t mask = svcmpge_f32(ptrue, *this, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator>(const Vectorized<float>& other) const {
svbool_t mask = svcmpgt_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator>=(const Vectorized<float>& other) const {
svbool_t mask = svcmpge_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> eq(const Vectorized<float>& other) const;
Vectorized<float> ne(const Vectorized<float>& other) const;
Vectorized<float> gt(const Vectorized<float>& other) const;
Vectorized<float> ge(const Vectorized<float>& other) const;
Vectorized<float> lt(const Vectorized<float>& other) const;
Vectorized<float> le(const Vectorized<float>& other) const;
inline Vectorized<float> eq(const Vectorized<float>& other) const;
inline Vectorized<float> ne(const Vectorized<float>& other) const;
inline Vectorized<float> gt(const Vectorized<float>& other) const;
inline Vectorized<float> ge(const Vectorized<float>& other) const;
inline Vectorized<float> lt(const Vectorized<float>& other) const;
inline Vectorized<float> le(const Vectorized<float>& other) const;
};
template <>
Vectorized<float> inline operator+(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
return svadd_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline operator-(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
return svsub_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline operator*(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
return svmul_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline operator/(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
return svdiv_f32_x(ptrue, a, b);
}
// frac. Implement this here so we can use subtraction
Vectorized<float> inline Vectorized<float>::frac() const {
inline Vectorized<float> Vectorized<float>::frac() const {
return *this - this->trunc();
}
@ -585,115 +537,91 @@ Vectorized<float> inline maximum(
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(
const Vectorized<float>& a,
const Vectorized<float>& b) {
inline Vectorized<float> minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
return svmin_f32_x(ptrue, a, b);
}
template <>
Vectorized<float> inline clamp(
const Vectorized<float>& a,
const Vectorized<float>& min,
const Vectorized<float>& max) {
inline Vectorized<float> clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a));
}
template <>
Vectorized<float> inline clamp_max(
const Vectorized<float>& a,
const Vectorized<float>& max) {
inline Vectorized<float> clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
return svmin_f32_x(ptrue, max, a);
}
template <>
Vectorized<float> inline clamp_min(
const Vectorized<float>& a,
const Vectorized<float>& min) {
inline Vectorized<float> clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
return svmax_f32_x(ptrue, min, a);
}
template <>
Vectorized<float> inline operator&(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}
template <>
Vectorized<float> inline operator|(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}
template <>
Vectorized<float> inline operator^(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}
Vectorized<float> inline Vectorized<float>::eq(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
return (*this == other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::ne(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
return (*this != other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::gt(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
return (*this > other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::ge(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
return (*this >= other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::lt(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
return (*this < other) & Vectorized<float>(1.0f);
}
Vectorized<float> inline Vectorized<float>::le(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
return (*this <= other) & Vectorized<float>(1.0f);
}
template <>
inline void convert(const float* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
const int64_t fraction = n % svcntw();
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svst1_f32(ptrue, dst + i, svldnt1_f32(ptrue, src + i));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
svbool_t pg = svwhilelt_b32(i, n);
svst1_f32(pg, dst + i, svldnt1_f32(pg, src + i));
}
}
template <>
inline void convert(const float* src, at::Half* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const float *src, at::Half *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svfloat16_t src_vec = svuzp1_f16(
svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)),
ZERO_F16);
svst1_f16(pg_16, reinterpret_cast<float16_t*>(dst) + i, src_vec);
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_16 = svwhilelt_b16(i, n);
pg_32 = svwhilelt_b32(i, n);
svfloat16_t src_vec = svuzp1_f16(
@ -703,19 +631,18 @@ inline void convert(const float* src, at::Half* dst, int64_t n) {
}
template <>
inline void convert(const at::Half* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const at::Half *src, float *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svfloat16_t src_vec = svzip1_f16(
svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
ZERO_F16);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
ZERO_F16);
svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_16 = svwhilelt_b16(i, n);
pg_32 = svwhilelt_b32(i, n);
svfloat16_t src_vec = svzip1_f16(
@ -726,20 +653,19 @@ inline void convert(const at::Half* src, float* dst, int64_t n) {
}
template <>
inline void convert(const bool* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const bool *src, float *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_8 = svwhilelt_b8(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svuint8_t src_vec_u8 =
svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_8 = svwhilelt_b8(i, n);
pg_32 = svwhilelt_b32(i, n);
svuint8_t src_vec_u8 =
@ -751,10 +677,7 @@ inline void convert(const bool* src, float* dst, int64_t n) {
}
template <>
Vectorized<float> inline fmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
inline Vectorized<float> fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
return svmad_f32_x(ptrue, a, b, c);
}
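
Note on the hunks above: the convert() overloads replace the compile-time Vectorized<float>::size() with the runtime svcntw(), which is what makes these loops vector-length agnostic. A minimal sketch of that predicated-loop pattern, assuming an SVE-enabled toolchain (the helper name scale_inplace is hypothetical and not part of this diff):

#include <arm_sve.h>
#include <cstdint>

// Process n floats with whatever vector length the hardware provides:
// svcntw() reports the number of 32-bit lanes, and svwhilelt_b32 builds a
// predicate that deactivates lanes past n, so no scalar tail loop is needed.
void scale_inplace(float* data, int64_t n, float factor) {
  for (int64_t i = 0; i < n; i += svcntw()) {
    svbool_t pg = svwhilelt_b32(i, n);        // lanes i .. min(i + VL, n) - 1 active
    svfloat32_t v = svld1_f32(pg, data + i);  // predicated load
    v = svmul_n_f32_x(pg, v, factor);         // scale the active lanes
    svst1_f32(pg, data + i, v);               // predicated store
  }
}

The diff refines this pattern by running an unpredicated main loop and only building svwhilelt predicates for the final partial iteration.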

View File

@ -15,7 +15,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
#define VEC_INT_SVE_TEMPLATE(vl, bit) \
template <> \
@ -49,10 +49,11 @@ inline namespace CPU_CAPABILITY {
operator svint##bit##_t() const { \
return values; \
} \
template <uint64_t mask> \
static Vectorized<int##bit##_t> blend( \
const Vectorized<int##bit##_t>& a, \
const Vectorized<int##bit##_t>& b) { \
const Vectorized<int##bit##_t>& b, \
uint64_t mask \
) { \
__at_align__ int##bit##_t flag_arr[size()]; \
for (int i = 0; i < size(); ++i) { \
flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 1 : 0; \
@ -493,7 +494,7 @@ Vectorized<int8_t> inline operator>>(
return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b));
}
#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256)
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -46,7 +46,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
// NOTE: These are low-performance implementations that we fall back on
// if we are not building with SVE. This may not be an issue, because
@ -100,12 +100,12 @@ struct VectorizedQuantizedConverter {
Vectorized<float> zero_point,
Vectorized<float> scale_zp_premul) const {
float_vec_return_type rv;
float tmp_scale[Vectorized<float>::size()];
float tmp_zero_point[Vectorized<float>::size()];
float * tmp_scale = new float[Vectorized<float>::size()];
float * tmp_zero_point = new float[Vectorized<float>::size()];
scale.store(tmp_scale);
zero_point.store(tmp_zero_point);
for (int i = 0; i < float_num_vecs(); ++i) {
float tmp_vals[Vectorized<float>::size()];
float * tmp_vals = new float[Vectorized<float>::size()];
for (int j = 0; j < Vectorized<float>::size(); ++j) {
tmp_vals[j] = at::native::dequantize_val<T>(
tmp_scale[j],
@ -113,7 +113,11 @@ struct VectorizedQuantizedConverter {
T(vals[Vectorized<float>::size() * i + j]));
}
rv[i] = Vectorized<float>::loadu(tmp_vals);
delete[] tmp_vals;
}
delete[] tmp_scale;
delete[] tmp_zero_point;
return rv;
}
@ -121,12 +125,12 @@ struct VectorizedQuantizedConverter {
Vectorized<float> scale,
Vectorized<float> zero_point) const {
float_vec_return_type rv;
float tmp_scale[Vectorized<float>::size()];
float tmp_zero_point[Vectorized<float>::size()];
float * tmp_scale = new float[Vectorized<float>::size()];
float * tmp_zero_point = new float[Vectorized<float>::size()];
scale.store(tmp_scale);
zero_point.store(tmp_zero_point);
for (int i = 0; i < float_num_vecs(); ++i) {
float tmp_vals[Vectorized<float>::size()];
float * tmp_vals = new float[Vectorized<float>::size()];
for (int j = 0; j < Vectorized<float>::size(); ++j) {
tmp_vals[j] = at::native::dequantize_val<T>(
tmp_scale[j],
@ -134,7 +138,10 @@ struct VectorizedQuantizedConverter {
T(vals[Vectorized<float>::size() * i + j]));
}
rv[i] = Vectorized<float>::loadu(tmp_vals);
delete[] tmp_vals;
}
delete[] tmp_scale;
delete[] tmp_zero_point;
return rv;
}
@ -205,7 +212,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@ -216,10 +223,11 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::qint32, /*precision=*/32>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::qint32*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());
delete[] float_vals;
return Vectorized<c10::qint32>::loadu(qvals.data());
}
@ -359,7 +367,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@ -370,10 +378,11 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::qint8>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::qint8*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());
delete[] float_vals;
return Vectorized<c10::qint8>::loadu(qvals.data());
}
@ -511,7 +520,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@ -522,10 +531,11 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::quint8>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::quint8*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());
delete[] float_vals;
return Vectorized<c10::quint8>::loadu(qvals.data());
}
@ -600,7 +610,7 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}
#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256)
} // namespace CPU_CAPABILITY
} // namespace at::vec
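
Throughout this file the fixed-size stack arrays become new[]/delete[] pairs because Vectorized<float>::size() is no longer a compile-time constant under length-agnostic SVE. A hedged alternative sketch (not what the diff does) that keeps the buffer size runtime-determined while releasing it automatically on every path:

#include <cstddef>
#include <cstdint>
#include <vector>

// RAII scratch buffer sized at run time (e.g. from svcntw()); no paired
// delete[] is required, so early returns cannot leak or double-free it.
void fill_runtime_sized_buffer(int64_t lanes) {
  std::vector<float> tmp(static_cast<std::size_t>(lanes));
  for (int64_t i = 0; i < lanes; ++i) {
    tmp[i] = static_cast<float>(i);  // placeholder for store()/dequantize output
  }
  // tmp.data() can be passed wherever a float* scratch pointer is expected.
}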

View File

@ -4,7 +4,9 @@
#include <ATen/cpu/vec/intrinsics.h>
#ifdef __aarch64__
#if !defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#else
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>

View File

@ -241,7 +241,7 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
Vectorized() = default;
Vectorized(c10::BFloat16 val)
: Vectorized16(at_vdupq_n_bf16(c10::bit_cast<at_bfloat16_t>(val.x))) {}
: Vectorized16(at_vdupq_n_bf16(val.x)) {}
Vectorized(float val) : Vectorized(c10::BFloat16(val)) {}
Vectorized(
value_type val0,
@ -253,14 +253,14 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
value_type val6,
value_type val7)
: Vectorized16(at_bfloat16x8_t{
c10::bit_cast<at_bfloat16_t>(val0.x),
c10::bit_cast<at_bfloat16_t>(val1.x),
c10::bit_cast<at_bfloat16_t>(val2.x),
c10::bit_cast<at_bfloat16_t>(val3.x),
c10::bit_cast<at_bfloat16_t>(val4.x),
c10::bit_cast<at_bfloat16_t>(val5.x),
c10::bit_cast<at_bfloat16_t>(val6.x),
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
val0.x,
val1.x,
val2.x,
val3.x,
val4.x,
val5.x,
val6.x,
val7.x}) {}
static Vectorized<c10::BFloat16> blendv(
const Vectorized<c10::BFloat16>& a,

View File

@ -4,7 +4,7 @@
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
template <typename src_t>
struct VecConvert<
float,

View File

@ -41,32 +41,16 @@ inline namespace CPU_CAPABILITY {
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif
template <int index, bool mask_val>
template <int index>
struct BlendRegs {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res);
};
template <int index>
struct BlendRegs<index, true> {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res) {
return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
}
};
template <int index>
struct BlendRegs<index, false> {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res) {
return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
}
float32x4_t& res,
bool mask_val
) {
return vsetq_lane_f32(vgetq_lane_f32(mask_val ? b : a, index), res, index);
}
};
template <>
@ -94,19 +78,15 @@ class Vectorized<float> {
operator float32x4_t() const {
return values;
}
template <int64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
const Vectorized<float>& b,
int64_t mask) {
Vectorized<float> vec;
vec.values = BlendRegs < 0,
(mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 1,
(mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 2,
(mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 3,
(mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs <0>::impl(a.values, b.values, vec.values, (mask & 0x01) != 0);
vec.values = BlendRegs <1> ::impl(a.values, b.values, vec.values, (mask & 0x02) != 0);
vec.values = BlendRegs <2> ::impl(a.values, b.values, vec.values, (mask & 0x04) != 0);
vec.values = BlendRegs <3> ::impl(a.values, b.values, vec.values, (mask & 0x08) != 0);
return vec;
}
static Vectorized<float> blendv(
@ -307,11 +287,48 @@ class Vectorized<float> {
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp)
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2)
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
// Implementation copied from Arm Optimized Routines https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
Vectorized<float> exp_u20() const {
return exp();
// bail out to sleef if it's a special case:
// i.e. there's an input s.t. |input| > 87.3....
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
uint32x4_t cmp = vcagtq_f32 (values, special_bound);
if (vpaddd_u64 (vreinterpretq_u64_u32 (cmp)) != 0) {
return exp();
}
const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);
const float ln2_hi = 0x1.62e4p-1f;
const float ln2_lo = 0x1.7f7d1cp-20f;
const float c0 = 0x1.0e4020p-7f;
const float c2 = 0x1.555e66p-3f;
const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};
const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);
const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);
const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);
const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t n = vrndaq_f32 (vmulq_f32 (values, inv_ln2));
float32x4_t r = vfmsq_laneq_f32 (values, n, ln2_c02, 0);
r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, exponent_bias));
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (c1, r, ln2_c02, 2);
float32x4_t q = vfmaq_laneq_f32 (c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (c4, r);
float32x4_t poly = vfmaq_f32 (p, q, r2);
return vfmaq_f32 (scale, poly, scale);
}
Vectorized<float> fexp_u20() const {
return exp();
return exp_u20();
}
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
fmod,

View File

@ -813,11 +813,12 @@ static inline Vectorized<T> binary_op_as_fp32(
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name( \
const type* data, Vectorized<float>& out) { \
__at_align__ float values[Vectorized<float>::size()]; \
__at_align__ float * values = new float[Vectorized<float>::size()]; \
for (const auto k : c10::irange(Vectorized<float>::size())) { \
values[k] = data[k]; \
} \
out = Vectorized<float>::loadu(values); \
delete[] values; \
} \
\
inline void load_fp32_from_##name( \

View File

@ -269,12 +269,13 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
#else // defined(CPU_CAPABILITY_AVX2)
#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__))
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
#endif
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
#endif
#endif // defined(CPU_CAPABILITY_AVX2)
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -294,7 +294,7 @@ struct VecConvert<
};
#endif
#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
template <>
struct VecConvert<float, 1, BFloat16, 1> {

View File

@ -270,7 +270,7 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
!defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
CONVERT_NON_VECTORIZED_INIT(Half, half)
#endif

View File

@ -915,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}
#elif !defined(CPU_CAPABILITY_SVE256)
#elif !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
// NOTE: These are low-performance implementations that we fall back on
// if we are not building with AVX2. This may not be an issue, because
@ -1374,11 +1374,11 @@ Vectorized<c10::quint8> inline maximum(
#endif // if defined(CPU_CAPABILITY_AVX2)
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
at::vec::Vectorized<int8_t> src) {
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
std::pair<Vectorized<float>, Vectorized<float>>
inline convert_int8_to_float(at::vec::Vectorized<int8_t> src) {
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));

View File

@ -292,8 +292,7 @@ class Vectorized16 {
_mm512_mask_storeu_epi16(ptr, mask, values);
}
}
template <int64_t mask>
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b, int64_t mask) {
return _mm512_mask_blend_epi16(mask, a.values, b.values);
}
static Vectorized<T> blendv(

View File

@ -69,10 +69,10 @@ class Vectorized<c10::complex<double>> {
operator __m512d() const {
return values;
}
template <int64_t mask>
static Vectorized<c10::complex<double>> blend(
const Vectorized<c10::complex<double>>& a,
const Vectorized<c10::complex<double>>& b) {
const Vectorized<c10::complex<double>>& b,
int64_t mask) {
// convert c10::complex<V> index mask to V index mask: xy -> xxyy
// NOLINTNEXTLINE(clang-diagnostic-warning)
switch (mask) {

View File

@ -89,10 +89,10 @@ class Vectorized<c10::complex<float>> {
operator __m512() const {
return values;
}
template <int64_t mask>
static Vectorized<c10::complex<float>> blend(
const Vectorized<c10::complex<float>>& a,
const Vectorized<c10::complex<float>>& b) {
const Vectorized<c10::complex<float>>& b,
int64_t mask) {
// convert c10::complex<V> index mask to V index mask: xy -> xxyy
static_assert(mask > -1 && mask < 256, "Unexpected mask value");
// The compiler would hopefully convert this switch condition

View File

@ -55,10 +55,10 @@ class Vectorized<double> {
operator __m512d() const {
return values;
}
template <int64_t mask>
static Vectorized<double> blend(
const Vectorized<double>& a,
const Vectorized<double>& b) {
const Vectorized<double>& b,
int64_t mask) {
return _mm512_mask_blend_pd(mask, a.values, b.values);
}
static Vectorized<double> blendv(

View File

@ -95,10 +95,10 @@ class Vectorized<float> {
operator __m512() const {
return values;
}
template <int64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
const Vectorized<float>& b,
int64_t mask) {
return _mm512_mask_blend_ps(mask, a.values, b.values);
}
static Vectorized<float> blendv(

View File

@ -528,10 +528,10 @@ class Vectorized<int16_t> : public Vectorizedi {
val2,
val1);
}
template <int64_t mask>
static Vectorized<int16_t> blend(
Vectorized<int16_t> a,
Vectorized<int16_t> b) {
Vectorized<int16_t> b,
int64_t mask) {
return _mm512_mask_blend_epi16(mask, a.values, b.values);
}
static Vectorized<int16_t> blendv(

View File

@ -68,7 +68,7 @@ Windows llvm will not have this definition.
#define VECTOR_WIDTH 64
#define int_vector __m512i
#elif defined(__aarch64__) && \
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
!defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256) // CPU_CAPABILITY_AVX512
// SVE code expects 256-vectors; leave that set for SVE?
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
@ -79,6 +79,18 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 16
#else // CPU_CAPABILITY_AVX512
#if defined(CPU_CAPABILITY_SVE)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#define int_vector __m256i
#else // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
#if defined(CPU_CAPABILITY_SVE256)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(32)))
#elif defined(_WIN32)
@ -88,6 +100,18 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 32
#define int_vector __m256i
#else // CPU_CAPABILITY_SVE
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#define int_vector __m256i
#endif // CPU_CAPABILITY_SVE256
#endif // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
#endif // CPU_CAPABILITY_AVX512
namespace at::vec {
@ -210,8 +234,7 @@ struct Vectorized {
auto as_bytes() const -> const char* {
return reinterpret_cast<const char*>(values);
}
template <int64_t mask_>
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b, const int64_t mask_) {
int64_t mask = mask_;
Vectorized vector;
for (const auto i : c10::irange(size())) {
@ -1312,7 +1335,7 @@ std::
T const* base_addr,
const Vectorized<int_same_size_t<T>>& vindex,
Vectorized<T>& mask) {
static constexpr int size = Vectorized<T>::size();
static const int size = Vectorized<T>::size();
T src_arr[size];
int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
int_same_size_t<T> index_arr[size];
@ -1405,7 +1428,7 @@ inline Vectorized<T> convert_to_fp_of_same_size(
// clang-format on
template <typename T>
inline std::enable_if_t<
Vectorized<T>::size() % 2 == 0,
true,
std::pair<Vectorized<T>, Vectorized<T>>>
deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
static constexpr int size = Vectorized<T>::size();
@ -1444,7 +1467,7 @@ VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2)
// clang-format on
template <typename T>
inline std::enable_if_t<
Vectorized<T>::size() % 2 == 0,
true,
std::pair<Vectorized<T>, Vectorized<T>>>
interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
static constexpr int size = Vectorized<T>::size();
@ -1486,7 +1509,7 @@ inline void convert(const src_T* src, dst_T* dst, int64_t n) {
template <typename T>
inline Vectorized<T> flip(const Vectorized<T>& data) {
static constexpr int size = Vectorized<T>::size();
static const int size = Vectorized<T>::size();
T output[size];
T buffer[size];
data.store(static_cast<void*>(buffer));
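
Across this diff blend() changes from a template parameter to an ordinary runtime argument; in the generic fallback above, bit i of the mask selects lane i from b and a cleared bit keeps lane i from a. A hedged usage sketch (lane count depends on the active CPU_CAPABILITY):

#include <ATen/cpu/vec/vec.h>

void blend_demo() {
  using Vec = at::vec::Vectorized<float>;
  Vec a(1.0f);                           // broadcast 1.0f to every lane
  Vec b(2.0f);                           // broadcast 2.0f to every lane
  // With the runtime-mask signature in this diff, the mask is a normal argument:
  Vec mixed = Vec::blend(a, b, 0b0101);  // lanes 0 and 2 from b, the rest from a
}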

View File

@ -15,7 +15,7 @@ template <
struct VecConvert {
static inline VectorizedN<dst_t, dst_n> apply(
const VectorizedN<src_t, src_n>& src) {
constexpr int count = std::min(
const int count = std::min(
VectorizedN<src_t, src_n>::size(), VectorizedN<dst_t, dst_n>::size());
__at_align__ src_t src_buf[VectorizedN<src_t, src_n>::size()];
src.store(src_buf);

View File

@ -2,6 +2,8 @@
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_n.h>
#include <cassert>
namespace at::vec {
inline namespace CPU_CAPABILITY {
@ -38,9 +40,9 @@ struct VecMaskLoad {
static inline VectorizedN<data_t, data_n> apply(
const data_t* ptr,
const VecMask<mask_t, mask_n>& vec_mask) {
constexpr typename VecMask<mask_t, mask_n>::size_type size =
const typename VecMask<mask_t, mask_n>::size_type size =
VecMask<mask_t, mask_n>::size();
static_assert(VectorizedN<data_t, data_n>::size() >= size);
assert((VectorizedN<data_t, data_n>::size() >= size));
__at_align__ data_t data[size];
__at_align__ mask_t mask[size];
auto mask_ = VectorizedN<mask_t, mask_n>(vec_mask);
@ -134,7 +136,7 @@ class VecMask {
template <typename U, int L>
static VecMask<T, N> from(const VectorizedN<U, L>& b_vec) {
__at_align__ U b_buf[size()];
if constexpr (size() >= VectorizedN<U, L>::size()) {
if (size() >= VectorizedN<U, L>::size()) {
b_vec.store(b_buf);
for (int i = VectorizedN<U, L>::size(); i < size(); i++) {
b_buf[i] = static_cast<U>(0);
@ -235,16 +237,18 @@ class VecMask {
template <
typename U,
int L,
std::enable_if_t<L >= 2 && VectorizedN<U, L>::size() >= size(), int> = 0>
std::enable_if_t<L >= 2, int> = 0>
VectorizedN<U, L> loadu(const U* ptr) const {
assert((VectorizedN<U, L>::size() >= size()));
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}
template <
typename U,
int L,
std::enable_if_t<L == 1 && Vectorized<U>::size() >= size(), int> = 0>
std::enable_if_t<L == 1, int> = 0>
Vectorized<U> loadu(const U* ptr) const {
assert((Vectorized<U>::size() >= size()));
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}
};

View File

@ -28,7 +28,7 @@ class VectorizedN {
using size_type = int;
static constexpr size_type size_T = sizeof(T);
static constexpr size_type size() {
static size_type size() {
return Vectorized<T>::size() * N;
}
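
VectorizedN::size() (and the Vectorized<T>::size() it multiplies) stops being constexpr because on length-agnostic SVE the lane count is a hardware property known only at run time. A small illustration, assuming an SVE-enabled toolchain:

#include <arm_sve.h>
#include <cstdio>

int main() {
  // svcntw() = number of 32-bit lanes in an SVE vector: 4 on a 128-bit
  // implementation, 8 on 256-bit, and so on. Nothing derived from it can
  // be a compile-time constant.
  std::printf("SVE float lanes: %llu\n",
              static_cast<unsigned long long>(svcntw()));
  return 0;
}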

Some files were not shown because too many files have changed in this diff