Related issue: #125077
### Feature
Inductor tries to remove dimensions with stride 0 from block pointers. Rather than loading with stride 0, it's more efficient to load a smaller block pointer, then use `tl.broadcast_to` to broadcast it up to the desired size. This already worked for simpler block pointers, but it was disabled for more complex block pointers which used `tl.reshape` to change the dimensionality after loading.
This PR generalizes the approach to work for all block pointers. The idea is to first reshape, adding singleton dimensions, then broadcast those singletons up to something larger, then reshape again to the final output shape. For readability, we emit this code only if it actually does something. Simpler loads will just have `tl.load`.
Here's an example of a complicated kernel that uses the full `reshape` -> `broadcast` -> `reshape` pattern after the load. (The first reshape is actually the slice `[:, None, None]`.)
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 8)
tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
tmp1 = tl.reshape(tl.broadcast_to(tl.load(tl.make_block_ptr(in_ptr1, shape=[8], strides=[8], block_shape=[((7 + XBLOCK) // 8)], order=[0], offsets=[(xoffset // 8)]), boundary_check=[0], eviction_policy='evict_last')[:, None, None], [((7 + XBLOCK) // 8), ((1) * ((1) <= (((7 + XBLOCK) // 8))) + (((7 + XBLOCK) // 8)) * ((((7 + XBLOCK) // 8)) < (1))), ((8) * ((8) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (8)))]), [XBLOCK])
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tmp2.to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```
Before this PR, we would have stride-0 dimensions:
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 8)
tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
tmp1 = tl.reshape(tl.load(tl.make_block_ptr(in_ptr1, shape=[8, 1, 8], strides=[8, 0, 0], block_shape=[((7 + XBLOCK) // 8), ((1) * ((1) <= (((7 + XBLOCK) // 8))) + (((7 + XBLOCK) // 8)) * ((((7 + XBLOCK) // 8)) < (1))), ((8) * ((8) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (8)))], order=[2, 1, 0], offsets=[(xoffset // 8), 0, xoffset % 8]), boundary_check=[0], eviction_policy='evict_last'), [XBLOCK])
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tl.broadcast_to(tmp2, [XBLOCK]).to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```
Here's a simpler example where we use 2D tiling. In this case we don't actually need the broadcast: it's implied by a slice adding a new singleton dimension. This code is not changed by this PR, but it's important to check that we don't accidentally insert unnecessary broadcasts.
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
ynumel = 8
xnumel = 8
yoffset = tl.program_id(1) * YBLOCK
yindex = yoffset + tl.arange(0, YBLOCK)[None, :]
ymask = yindex < ynumel
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
x1 = xindex
y0 = yindex
tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[8, 8], strides=[1, 8], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), boundary_check=[0, 1])
tmp1 = tl.load(tl.make_block_ptr(in_ptr1, shape=[8], strides=[8], block_shape=[YBLOCK], order=[0], offsets=[yoffset]), boundary_check=[0], eviction_policy='evict_last')[None, :]
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[8, 8], strides=[1, 8], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), tmp2.to(tl.float32), boundary_check=[0, 1])
''', device_str='cuda')
```
### Test Plan
Added a new expecttest to check the emitted code for broadcast addition. Looking at the test, we can see that stride 0 dimensions are removed. (This test generated the example kernels in the previous section.)
This change also removed a stride-0 dimension in an existing block pointer test. I updated the expected code accordingly.
Bonus: I noticed that the test parametrization for `config.prefer_nd_tiling` wasn't working as intended. It ended up always setting this option to `True`. Fixed it so we get the intended test coverage.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135557
Approved by: https://github.com/shunting314, https://github.com/jansel
Co-authored-by: Yueming Hao <yhao@meta.com>
Summary: We have an internal report of a Triton compiler error `ValueError: Cannot broadcast, rank mismatch: [1], [1, 2048]` coming from a line like this:
```
tmp25 = tl.broadcast_to(((tl.full([1], 1.00000000000000, tl.float64)) + ((ks0 // 3278).to(tl.float64))) / (((tl.full([1], 0.500000000000000, tl.float64))*(libdevice.sqrt((1 + ((ks0 // 3278)*(ks0 // 3278)) + ((-2)*(ks0 // 3278))).to(tl.float64).to(tl.float32)))) + ((tl.full([1], 0.500000000000000, tl.float64))*((1 + (ks0 // 3278)).to(tl.float64)))), [XBLOCK, RBLOCK])
```
https://github.com/pytorch/pytorch/pull/135260 is the cause, presumably because we turn a constant into a 1-element tensor with `tl.full([1], const, tl.float64)`. It looks like changing the syntax to `tl.full([], const, tl.float64)`, which produces a 0-d value instead, gives us what we want?
Differential Revision: [D63465169](https://our.internmc.facebook.com/intern/diff/D63465169)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136594
Approved by: https://github.com/mengluy0125, https://github.com/jansel
Summary:
We skip `save_gpu_kernel` if the kernel has already been saved.
This gives us a more accurate Triton profiling result. The following traces show before/after the change for a benchmarking run of a trivial addmm:
Before:
<img width="1255" alt="Screenshot 2024-09-23 at 10 26 53 AM" src="https://github.com/user-attachments/assets/5aea05ef-6ef0-464c-8da9-17b31c97b43a">
After:
<img width="910" alt="Screenshot 2024-09-23 at 10 27 03 AM" src="https://github.com/user-attachments/assets/488b7d4f-268f-41cf-8553-cb16ceeae118">
We can see that before the change, the benchmarking includes two parts:
(1) the overhead of our triton_heuristic call, which includes the save/get and the (expensive) hash computation, and
(2) the actual computation of the Triton kernel.
We see that (1) accounts for >50% of the time, which makes kernel selection for profiling often choose aten kernels over Triton kernels.
Test Plan:
Existing OSS CI
[Redacted, Some internal model results in D63441430]
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136389
Approved by: https://github.com/desertfire
Summary:
Reenable the `test_triton_wrapper.py` test again
# Why
We want this to run internally
# What
- fix python path issue on the test
- reenable the test
# Background
It appears that the parent process does not pass its entire module search path down to the child process. Namely, if there is some setup that makes the effective `sys.path` different from, say, PYTHONPATH, the child will not inherit that setup. To avoid needing to keep track of specific setups, we pass the effective `sys.path` from the parent to the child through the PYTHONPATH env variable, as sketched below.
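A minimal sketch of that propagation (illustrative, not the test harness's exact code):
```python
import os
import subprocess
import sys

# Rebuild the child's module search path from the parent's *effective* sys.path,
# so any runtime sys.path modifications in the parent survive the spawn.
env = {**os.environ, "PYTHONPATH": os.pathsep.join(p for p in sys.path if p)}
subprocess.run(
    [sys.executable, "-c", "import sys; print(sys.path)"],
    env=env,
    check=True,
)
```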
Test Plan: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:triton_wrapper
Differential Revision: D63438186
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136721
Approved by: https://github.com/henrylhtsang
## Motivation
The FSDP common code for FSDP UT execution is mostly written with the CUDA device in mind. However, other devices such as Intel Gaudi support most of the functionality. We are generalizing the base content so that the UT content can be used for non-CUDA device execution.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133209
Approved by: https://github.com/kwen2501
Move the `get-job-id` steps before running the tests and copy the environment variables from `_mac-test.yml` added in https://github.com/pytorch/pytorch/pull/113099.
Should fix the following warning during MPS test run:
```
/Users/ec2-user/runner/_work/pytorch/pytorch/tools/stats/upload_metrics.py:147: UserWarning: Not emitting metrics for td_test_failure_stats_v2. Missing job_id. Please set the JOB_ID environment variable to pass in this value.
warn(f"Not emitting metrics for {metric_name}. {e}")
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136791
Approved by: https://github.com/albanD, https://github.com/izaitsevfb
PyTorch community members have reported issues with building PyTorch from source for ROCm in an environment that doesn't have aotriton pre-installed, because aotriton is only installed in the [CI](a8ed873ba2/.ci/docker/manywheel/Dockerfile (L197)) docker images. Building aotriton from source can take ~45 minutes.
This PR fixes the issue by downloading the aotriton tarball in such scenarios, *unless the user explicitly wants to build aotriton from source using the AOTRITON_INSTALL_FROM_SOURCE=1 env var*
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136603
Approved by: https://github.com/atalman
Co-authored-by: Xinya Zhang <Xinya.Zhang@amd.com>
Summary:
With empty graphs, the `graph.inserting_before(first_user_input = None)` call turns into a `graph.inserting_after(root)` call, inverting the order of constant input nodes being inserted.
This fixes the issue by initializing the insertion anchor to the first node in the graph (still valid even if it's not a user input, since it's only used as an insertion point).
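A minimal sketch (not the PR's exact code) of why the anchor choice matters for insertion order:
```python
import torch.fx as fx

graph = fx.Graph()
graph.output(None)  # a graph with no user-input placeholders

# New behavior: anchor on the first node in the graph, which always exists here.
anchor = next(iter(graph.nodes))
with graph.inserting_before(anchor):
    graph.placeholder("const0")
    graph.placeholder("const1")
print([n.name for n in graph.nodes])  # ['const0', 'const1', 'output'] -- order preserved

# Old behavior: with no user-input placeholder the anchor was None, and
# inserting_before(None) falls back to inserting after the root, which reverses
# the order of consecutively inserted constant-input nodes.
```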
Test Plan: test_export
Differential Revision: D63403514
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136658
Approved by: https://github.com/avikchaudhuri
This file hadn't had an overhaul in a few years, so one is long overdue. Most of the credit goes to @orionr for gathering all of this info.
The main rules we followed:
- No code contributor is removed; they're all placed as emeritus
- Break down categories that are too big, to make this document useful for knowing whom to ping
- No category whose code is still in the codebase is removed
- We did not rework the categories (for example, to be closer to `module:` labels) and leave that for later
- All non-emeritus names are ordered by their number of comments on issues related to their topic
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136672
Approved by: https://github.com/eqy, https://github.com/ezyang, https://github.com/seemethere, https://github.com/malfet
Not sure why `isinf` is a composite op, but it needs to be implemented by hand. The implementation is a trivial call to
```objc
[mpsGraph equalWithPrimaryTensor:input
secondaryTensor:[mpsGraph constantWithScalar:std::numeric_limits<T>::infinity()
dataType:input.dataType]]
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136689
Approved by: https://github.com/Skylion007
Prior to this PR, calling `reshape()` under `inference_mode()` would throw a `NotImplementedError`. This is because `inference_mode()` disables autograd key dispatch, incidentally preventing the decomposition of reshape for NJT.
This PR fixes this by redispatching on the `CompositeImplicitAutogradNestedTensor` key whenever a composite implicit op is encountered in `NJT.__torch_dispatch__()`. This fixes reshape and any other composite implicit ops underneath `inference_mode()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134683
Approved by: https://github.com/soulitzer, https://github.com/albanD
ghstack dependencies: #136566
Fixes #136565
This PR makes the python fallback robust to the case where there are no active modes & no tensors with the Python key. In this case, simply redispatch with the Python key disabled.
This was found when trying to use reentrant dispatch for NJT to get decompositions under `inference_mode()` when the autograd key is disabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136566
Approved by: https://github.com/bdhirsh
**Summary**
Optimize the WOQ int8 AMX performance by changing the int8 -> bf16 conversion.
Earlier, 16 int8 elements were being loaded at a time & converted to 16 BF16 elements.
With this change, 32 int8 elements will be loaded at a time, and converted to a cache-line of 32 BF16 elements more efficiently.
Performance before
```
AUTOTUNE _weight_int8pack_mm(4096x4096, 4096x4096, 4096)
cpp_packed_gemm_0 38.0439 ms 100.0%
_weight_int8pack_mm 50.2524 ms 75.7%
SingleProcess AUTOTUNE benchmarking takes 1.1087 seconds and 1.9791 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 11008x4096, 11008)
cpp_packed_gemm_4 78.2038 ms 100.0%
_weight_int8pack_mm 119.1962 ms 65.6%
SingleProcess AUTOTUNE benchmarking takes 1.9274 seconds and 1.9949 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x11008, 4096x11008, 4096)
cpp_packed_gemm_6 79.2368 ms 100.0%
_weight_int8pack_mm 118.3212 ms 67.0%
SingleProcess AUTOTUNE benchmarking takes 1.9200 seconds and 2.0015 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 32000x4096, 32000)
cpp_packed_gemm_224 225.7201 ms 100.0%
_weight_int8pack_mm 388.5588 ms 58.1%
```
Performance after this PR
```
AUTOTUNE _weight_int8pack_mm(4096x4096, 4096x4096, 4096)
cpp_packed_gemm_0 11.0086 ms 100.0%
_weight_int8pack_mm 50.2918 ms 21.9%
SingleProcess AUTOTUNE benchmarking takes 1.0837 seconds and 2.0301 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 11008x4096, 11008)
cpp_packed_gemm_4 24.3528 ms 100.0%
_weight_int8pack_mm 119.8492 ms 20.3%
SingleProcess AUTOTUNE benchmarking takes 1.8303 seconds and 1.8195 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x11008, 4096x11008, 4096)
cpp_packed_gemm_6 24.6148 ms 100.0%
_weight_int8pack_mm 119.1908 ms 20.7%
SingleProcess AUTOTUNE benchmarking takes 1.8315 seconds and 1.8352 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 32000x4096, 32000)
cpp_packed_gemm_224 78.1369 ms 100.0%
_weight_int8pack_mm 387.6289 ms 20.2%
SingleProcess AUTOTUNE benchmarking takes 4.5059 seconds and 1.8010 seconds precompiling
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136630
Approved by: https://github.com/jgong5
ghstack dependencies: #136353
Summary:
We have a user report on a BA model that raised `AttributeError: 'SymFloat' object has no attribute 'shape'`, so we add a type check for the meta node.
See more context in the post
https://fb.workplace.com/groups/1075192433118967/permalink/1510477489590457/
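A minimal sketch of the kind of check added (hypothetical helper, not the exact diff):
```python
import torch

def node_meta_shape(node):
    # The node's "val" meta may be a FakeTensor (which has .shape) or a
    # SymFloat/SymInt (which does not); guard with an isinstance check
    # instead of assuming a tensor.
    val = node.meta.get("val", None)
    if isinstance(val, torch.Tensor):  # FakeTensor is a torch.Tensor subclass
        return val.shape
    return None
```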
Test Plan:
# local reproduce
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode split-batch-decompose --flow_id 646303196
```
P1609807876
# E2E
before fix
f646303196
after fix
Differential Revision: D63399959
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136650
Approved by: https://github.com/ezyang
Fixes #133683 Fixes #133684 Fixes #133688
This PR introduces a new base class `_ArglessActivation` and refactors five existing activation functions to inherit from it. This change aims to improve documentation consistency and also API consistency with other activation functions that do have parameters and explicitly call `super().__init__()`
Key changes and considerations:
1. Added new class `_ArglessActivation`:
2. Refactored the following classes to inherit from `_ArglessActivation`:
- Sigmoid
- Tanh
- Softsign
- Tanhshrink
- Softmax2d
3. Performance consideration:
- This change introduces a slight overhead for creating a new stack frame and handling an additional function call on every instance creation
- The impact is expected to be minimal in most use cases
Docs view before:
<img width="425" alt="Screen Shot 2024-09-18 at 3 00 22 PM" src="https://github.com/user-attachments/assets/ca0d1000-44c5-4c52-b344-68f7e170bafe">
Docs view after:
<img width="431" alt="Screen Shot 2024-09-18 at 3 00 52 PM" src="https://github.com/user-attachments/assets/f7ceb8f3-a2a2-4fd6-a2b8-39105a02bcbd">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136296
Approved by: https://github.com/mikaylagawarecki
Fixes https://github.com/pytorch/pytorch/issues/136177
The motivation is that torch::deploy doesn't handle this well. The workaround for users is to use C++ custom ops.
All torch.library APIs ultimately go through the torch.library.Library object, so we add checks there to no-op under torch::deploy.
Test Plan:
- new test
- going to test this internally and hope nothing breaks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136645
Approved by: https://github.com/ezyang
Fix two more leaks of the same variety as #136507 (see that PR desc and attached gdoc for debug details).
This time, also add a test-time check that helped to discover new leaks and ensure we won't accidently regress.
Adds `check_tensor_leak` util which internally asserts no tensors are being kept alive by other objects involved in py ref cycles.
Uses objgraph for a nice debug utility when a leak is found.
Credit to @H-Huang for pointing out objgraph and helping debug the `param_group["intermediates"]` leak.
I manually confirmed that all 3 of the leaks identified/fixed so far are caught by the unit test and checker.
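A sketch of what the checker does (assumed shape, not the exact implementation in `common_utils.py`):
```python
import gc
import warnings

import torch

def check_tensor_leak(dump_graphs: int = 1) -> None:
    gc.set_debug(gc.DEBUG_SAVEALL)  # keep members of collected cycles in gc.garbage
    gc.collect()
    leaked = [o for o in gc.garbage if isinstance(o, torch.Tensor)]
    gc.set_debug(0)
    if not leaked:
        return
    warnings.warn(
        f"{len(leaked)} tensors were found in the garbage. "
        "Did you introduce a reference cycle?"
    )
    try:
        import objgraph  # optional: render the backref chain keeping a tensor alive
        for t in leaked[:dump_graphs]:
            objgraph.show_backrefs([t], max_depth=5)  # writes a .dot/.png under /tmp
    except ImportError:
        pass
```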
Sample output, if I re-introduce a leak by commenting out `del param_group["intermediates"]` in _backward.py,
and run `python test/distributed/pipelining/test_schedule_multiproc.py -k test_schedule_with_native_zero_bubble`:
```
warnings.warn(
/data/users/whc/pytorch/torch/testing/_internal/common_utils.py:5341: UserWarning: 34 tensors were found in the garbage. Did you introduce a reference cycle?
warnings.warn(
/data/users/whc/pytorch/torch/testing/_internal/common_utils.py:5347: UserWarning: Dumping first 1 objgraphs of leaked tensors rendered to png
Graph written to /tmp/objgraph-ztz642h3.dot (19 nodes)
Graph viewer (xdot) not found, generating a png instead
Image generated as /tmp/objgraph-ztz642h3.png
```
rendering of ` /tmp/objgraph-ztz642h3.png`:
<img width="1671" alt="image" src="https://github.com/user-attachments/assets/9098ff29-224c-4533-935b-83c210ac2e22">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136584
Approved by: https://github.com/kwen2501, https://github.com/H-Huang
ghstack dependencies: #136507
Co-authored-by: Howard Huang <howardhuang@fb.com>
Fixes#131701
Use CMake imported targets more consistently to eliminate hardcode paths.
Here is the new relevant sections of Caffe2Targets.cmake:
```
set_target_properties(c10_hip PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_LINK_LIBRARIES "c10;hip::amdhip64"
)
```
```
set_target_properties(torch_hip PROPERTIES
INTERFACE_COMPILE_DEFINITIONS "USE_C10D_NCCL"
INTERFACE_COMPILE_OPTIONS "-fPIC;-D__HIP_PLATFORM_AMD__=1;-DCUDA_HAS_FP16=1;-DUSE_ROCM;-D__HIP_NO_HALF_OPERATORS__=1;-D__HIP_NO_HALF_CONVERSIONS__=1;-DTORCH_HIP_VERSION=602;-Wno-shift-count-negative;-Wno-shift-count-overflow;-Wno-duplicate-decl-specifier;-DCAFFE2_USE_MIOPEN;-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP;-std=c++17;-DHIPBLAS_V2;-DHIP_NEW_TYPE_ENUMS"
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_LINK_LIBRARIES "c10_hip;torch_cpu_library;hip::amdhip64;MIOpen;hiprtc::hiprtc;roc::hipblaslt;roc::hipblas;hip::hipfft;hip::hiprand;roc::hipsparse;roc::hipsolver"
)
```
The HIPCUB dependency was not actually used, which is why it is removed here; the imported target had undesirable side effects.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136283
Approved by: https://github.com/jeffdaily, https://github.com/Skylion007, https://github.com/jithunnair-amd, https://github.com/atalman
This reverts commit 7743149b2be4a9eba7e0997ccdc6abe552bec266.
Reverts
* https://github.com/pytorch/pytorch/pull/135503
* https://github.com/pytorch/pytorch/pull/135502
* https://github.com/pytorch/pytorch/pull/135422
This passes this test. Earlier, the getitem would stay as a getitem in the FX graph, but now fake tensor propagation fails, saying that `.item()` is called. It seems that the torch function is not getting triggered during fake tensor propagation.
```
import torch
from torch.nn.attention.flex_attention import BlockMask, _mask_mod_signature, _score_mod_signature, flex_attention
from torch._inductor.lowering import make_pointwise, register_lowering
from torch._inductor.virtualized import ops
from torch.nn.attention.flex_attention import create_block_mask
torch.set_default_device('cuda')
flex_attention = torch.compile(flex_attention, dynamic=False)
prefix_lengths = torch.arange(8)
def prefix_lm(b, h, q, kv):
return prefix_lengths[b] >= kv
mask = create_block_mask(prefix_lm, 8, None, 512, 512, _compile=True)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136590
Approved by: https://github.com/Chillee
Summary: If you actually import the module, you might end up with some import cycle situation where a module is imported too early and accesses things that are not initialized yet.
Test Plan:
sandcastle and ossci
```
TORCH_LOGS=+torch._inductor.codecache buck run mode/opt caffe2/benchmarks/dynamo:torchbench
```
Differential Revision: D63330224
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136548
Approved by: https://github.com/Skylion007
Summary: Previously the `_inline_module` helper function only worked with submodules that have args specified. This diff updates the util function to look for input arguments in submodule kwargs first, using placeholder node names, and then fall back to the list of args if the node name is not found.
Test Plan:
```
buck2 run @//mode/{opt,mtia,inplace} //glow/fb/fx/fba/tests:test_fba_inductor -- -r test_connected_fusions
```
Differential Revision: D63347675
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136631
Approved by: https://github.com/jfix71
AMD devices have a warp (wavefront) size of 64; this PR makes the handling of "ELEMENTS_PER_WARP_32" generic and uses DeviceProperties.warp_size to determine the warp size instead of hard-coding it as 32. It also renames the enum value. Added a unit test for this.
Note: I left the old enum option (ELEMENTS_PER_WARP_32) as-is instead of renaming it. I'm not sure whether we should expect caches to get invalidated here; if this concern is valid, then there's a risk that the enum would get updated but some model could still use the cached inductor code, which would reference "ELEMENTS_PER_WARP_32", which would no longer exist.
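A minimal sketch of the warp-size query this relies on (illustrative):
```python
import torch

props = torch.cuda.get_device_properties(0)
# 32 on NVIDIA GPUs, 64 (wavefront size) on most AMD GPUs; fall back to 32 if absent.
warp_size = getattr(props, "warp_size", 32)
```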
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136472
Approved by: https://github.com/jansel
Summary: Title
Test Plan: CI
This fixes some breaking tests in ExecuTorch. I think the root cause is that when we have aten::matmul, which we are not preserving, we register a meta implementation from the C++ side. It seems like the C++ kernel doesn't work well with a mix of FakeTensor and real tensors. This PR sidesteps the problem by always preferring the Python CIA decomp over the C++ CIA decomp.
Differential Revision: D63297050
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136492
Approved by: https://github.com/bdhirsh
Summary: Previously we had a very bad bug where we didn't allow any decomp on CIA ops. This never mattered before because we never had to actually push CIA decomps to the Python key level in export.
Test Plan: CI
Differential Revision: D63363749
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136600
Approved by: https://github.com/bdhirsh
Fixes #136504
If you have a tl.constexpr parameter to a triton kernel, and you pass in a SymNode, then, right now, you run into failures (see under 'constants'):
```
File "/tmp/torchinductor_dberard/na/cnax67r5zmslz7bvdfizteaepj7fajpjallb3bu2gyetjcdqtbzj.py", line 14, in <module>
triton_meta={'signature': {0: '*fp32', 1: '*fp32'}, 'device': DeviceProperties(type='cuda', index=0, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=132, warp_size=32), 'constants': {2: s0, 3: 256}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1), equal_to_1=())]},
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
NameError: name 's0' is not defined
```
To fix this, we specialize on the value during dynamo tracing, so that we have a real integer when we do codegen.
Alternatives: specialize somewhere else (e.g. inductor); or figure out how to actually pass the value dynamically into the user-written kernel. However, if we try to pass a dynamic value, then we wouldn't be able to precompile the triton kernels in inductor or use AOTI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136512
Approved by: https://github.com/oulgen, https://github.com/jansel, https://github.com/eellison
The test is failing in trunk atm with the following error:
```
test_serialization.py::TestSerialization::test_skip_data_serialization_materialize_fake_False - AssertionError: "Can't pickle local object 'WeakValueDictionary.__init__.<locals>.remove'" does not match "Can't get local object 'WeakValueDictionary.__init__.<locals>.remove'"
```
for example, 36f0e61166
This comes from this cpython commit a3076c734d, and manifests in python 3.12.5 currently used in CI. The failure doesn't happen when I try it out with 3.12.3 and 3.12.4. Looking at the commit logs of https://github.com/python/cpython/commits/main/Lib/pickle.py, it looks like the exception message is changing back and forth, so I guess a regex match would capture both.
Fixes the compilation error of max-autotune for `maml_omniglot` (AMP and FP32) and `soft_actor_critic` (AMP) in Torchbench for single-thread dynamic shapes case:
```
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp: In function ‘void kernel(const bfloat16*, const bfloat16*, const bfloat16*, bfloat16*, int64_t)’:
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp:279:41: error: the value of ‘Mr_blocks’ is not usable in a constant expression
279 | constexpr int64_t m_block_end = Mr_blocks;
| ^~~~~~~~~
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp:237:19: note: ‘Mr_blocks’ was not initialized with a constant expression
237 | const int64_t Mr_blocks = (M + Mr - 1) / Mr;
| ^~~~~~~~~
```
The PR also updates the UT to add a test for `BS`=512 in single thread.
The previous case has `BS`=1024, equal to the `K` and `N` values; the generated code does not have symbolic shapes and thus fails to capture the above issue.
By adding a case with `BS`=512, the generated code will have a symbolic shape for the M dim and is able to reproduce the issue that this PR is addressing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136418
Approved by: https://github.com/jgong5
https://github.com/pytorch/pytorch/pull/136087 updated pybind11 to 2.13.6, and that new release has a feature expressed by [a new function](https://pybind11.readthedocs.io/en/latest/changelog.html#version-2-13-6-september-13-2024), `_pybind11_conduit_v1_`. The presence of this function breaks the serialization mechanisms used by Triton and in PyTorch itself.
Possible errors that have been noticed due to this change:
<details>
<summary> the first error </summary>
```bash
_________ KernelTests.test_layout_constraint_needs_fixed_stride_order __________
Traceback (most recent call last):
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1072, in test_layout_constraint_needs_fixed_stride_order
eager_out = f(x)
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1068, in f
arange_out(x, y)
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1059, in arange_out
kernel[grid](x, out, n_elements, BLOCK_SIZE=4)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/runtime/jit.py", line 330, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/runtime/jit.py", line 657, in run
kernel = self.compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/compiler/compiler.py", line 315, in compile
metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata, default=vars), metadata_filename,
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/__init__.py", line 234, in dumps
return cls(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
TypeError: vars() argument must have __dict__ attribute
```
</details>
<details>
<summary> the second error </summary>
```bash
________________ TestTritonWrapper.test_wrapper_using_gpu_seed _________________
Traceback (most recent call last):
File "/cache/pytorch-c5e9d03a2da4b93481737594cbe2f5931fa569aa833f206a638189cad2c36d3c-11/test/inductor/test_triton_wrapper.py", line 40, in test_wrapper_using_gpu_seed
out = f(x, y)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 465, in _fn
return fn(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 1292, in __call__
return self._torchdynamo_orig_callable(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 1087, in __call__
result = self._inner_convert(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 530, in __call__
return _compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 933, in _compile
guarded_code = compile_inner(code, one_graph, hooks, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 675, in compile_inner
return _compile_inner(code, one_graph, hooks, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_utils_internal.py", line 87, in wrapper_function
return function(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 708, in _compile_inner
out_code = transform_code_object(code, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/bytecode_transformation.py", line 1322, in transform_code_object
transformations(instructions, code_options)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 220, in _fn
return fn(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 643, in transform
tracer.run()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2776, in run
super().run()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 979, in run
while self.step():
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 891, in step
self.dispatch_table[inst.opcode](self, inst)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2967, in RETURN_VALUE
self._return(inst)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2952, in _return
self.output.compile_subgraph(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1117, in compile_subgraph
self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1369, in compile_and_call_fx_graph
compiled_fn = self.call_user_compiler(gm)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1416, in call_user_compiler
return self._call_user_compiler(gm)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1465, in _call_user_compiler
raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1446, in _call_user_compiler
compiled_fn = compiler_fn(gm, self.example_inputs())
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/repro/after_dynamo.py", line 130, in __call__
compiled_gm = compiler_fn(gm, example_inputs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/__init__.py", line 2235, in __call__
return compile_fx(model_, inputs_, config_patches=self.config)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1528, in compile_fx
return aot_autograd(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/backends/common.py", line 72, in __call__
cg = aot_module_simplified(gm, example_inputs, **self.kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1071, in aot_module_simplified
compiled_fn = dispatch_and_compile()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1056, in dispatch_and_compile
compiled_fn, _ = create_aot_dispatcher_function(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 522, in create_aot_dispatcher_function
return _create_aot_dispatcher_function(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 759, in _create_aot_dispatcher_function
compiled_fn, fw_metadata = compiler_fn(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 179, in aot_dispatch_base
compiled_fw = compiler(fw_module, updated_flat_args)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1357, in fw_compiler_base
return _fw_compiler_base(model, example_inputs, is_inference)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1428, in _fw_compiler_base
return inner_compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 479, in compile_fx_inner
return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/repro/after_aot.py", line 85, in debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 665, in _compile_fx_inner
compiled_graph = FxGraphCache.load(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 1341, in load
compiled_graph = compile_fx_fn(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 574, in codegen_and_compile
compiled_graph = fx_codegen_and_compile(gm, example_inputs, **fx_kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 882, in fx_codegen_and_compile
compiled_fn = graph.compile_to_fn()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1952, in compile_to_fn
return self.compile_to_module().call
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1878, in compile_to_module
return self._compile_to_module()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1906, in _compile_to_module
mod = PyCodeCache.load_by_key_path(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2866, in load_by_key_path
mod = _reload_python_module(key, path)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/runtime/compile_tasks.py", line 45, in _reload_python_module
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/tmps59zkbew/kg/ckgkb4gt5fs5pll4o7fqawppsmdezu5h52cq6nmrvi3yy6j7ddq4.py", line 45, in <module>
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/async_compile.py", line 198, in triton
kernel = TritonCodeCache.load(kernel_name, source_code)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2916, in load
return _module_to_triton_kernel(PyCodeCache.load(source_code), kernel_name)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2853, in load
return cls.load_by_key_path(key, path, linemap, attrs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2866, in load_by_key_path
mod = _reload_python_module(key, path)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/runtime/compile_tasks.py", line 39, in _reload_python_module
raise RuntimeError(
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Failed to import /tmp/tmps59zkbew/g3/cg3zgxsidsjhdlz2lzvajvubdq6kg2x2hzd2kznfj43qwvlv33du.py
SyntaxError: invalid syntax (cg3zgxsidsjhdlz2lzvajvubdq6kg2x2hzd2kznfj43qwvlv33du.py, line 14)
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136280
Approved by: https://github.com/etaf, https://github.com/jansel, https://github.com/EikanWang
Co-authored-by: Henry Schreiner <HenrySchreinerIII@gmail.com>
Fix the correctness issue of https://github.com/pytorch/ao/pull/884/. The current implementation for converting between `Half/BFloat16` and `int8/uint8` incorrectly assumes that 1/4 of the int8/uint8 vector lane maps to 1/2 of the Half/BFloat16 vector lane. This assumption leads to accuracy issues after the full bit-width vectorization of the Half data type was introduced. When converting between int8 weights and the half data type, the generated code is as follows:
```
#include "/tmp/torchinductor_leslie/xw/cxww3s7wxrujoyxna7mlcjktid2uu6nntixqwm542xfkd756gl3x.h"
extern "C" void kernel(const int8_t* in_ptr0,
half* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2048L); x0+=static_cast<int64_t>(32L))
{
auto tmp0 = at::vec::Vectorized<int8_t>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(32));
auto tmp1 = at::vec::convert<half>(tmp0);
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(32));
}
}
}
```
In this PR, we address the issue by changing the implementation to convert 1/2 of the int8/uint8 vector lane into a full vector lane of Half/BFloat16.
**Test Plan**
* AO: `python test/integration/test_integration.py -k test_int8_weight_only_quant_subclass_api`
* `python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_convert_int8_to_half_vec`
* Due to the CPP backend legalization pass, we are unable to create a unit test to simulate the conversion from `Half` to `int8`. Instead, we rely on a C++ test case.
* `./build/bin/vec_test_all_types_AVX512 --gtest_filter="VecConvertTestsReducedFloat/*.ConvertReduced"`
* `./build/bin/vec_test_all_types_AVX2 --gtest_filter="VecConvertTestsReducedFloat/*.ConvertReduced"`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136353
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
Seems like some other tests are holding onto memory that is not gc'able (e.g., cuBLAS workspaces), so these tests, while working in isolation, fail when run as, e.g., `python test/test_cuda.py -k able`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136496
Approved by: https://github.com/ezyang
TL;DR: found that forward activation tensors were being kept alive "forever" (or until GC ran), and tracked it down to a cycle involving `stage_backward.<locals>.extract_tensors_with_grads`.
The reference cycle in question is below. (constructed using gc.get_referrers after doing a gc.collect in gc debug mode)
tensor is kept alive by
`[(<class 'cell'>, '0x7f7360234400')]`
tuple of cell objects
`(<cell at 0x7f73602343d0: function object at 0x7f734fff0ee0>, <cell at 0x7f7360234400: list object at 0x7f734e4d9a80>, <cell at 0x7f73602a4190: list object at 0x7f734eff8b00>)`
is kept alive by
`[(<class 'function'>, '0x7f734fff0ee0')]`
`<function stage_backward.<locals>.extract_tensors_with_grads at 0x7f734fff0ee0>`
is kept alive by
`[(<class 'cell'>, '0x7f73602343d0')]`
Put into more plain terms,
```
def stage_backward(...):
...
stage_output_tensors = []
# a cell object will exist that contains the variables defined in stage_backward and used by
# both stage_backward and nested functions
# in this case, the cell object contains 'stage_output_tensors' but
# this function object will hold a reference to a 'cell' that contains any vars from
# the parent scope not explicitly passed into the function as args.
def extract_tensors_with_grads(...):
...
# extract_tensors_with_grads refers to stage_output_tensors, so stage_output_tensors
# is in the cell
stage_output_tensors.append(output_val)
...
# but extract_tensors_with_grads ALSO refers to itself (extract_tensors_with_grads),
# so `extract_tensors_with_grads` will be in the cell
extract_tensors_with_grads(...)
```
More debug details:
https://docs.google.com/document/d/1QPH1Lz0tnieIFPM2tyHrjVB-bjlnHuDgjx1p2am3cmE/edit?usp=sharing
In pdb:
```
gc.collect()
g = gc.garbage
g[-1]
[rank0]:(Pdb) [rank0]:<function
stage_backward.<locals>.extract_tensors_with_grads at 0x7fee5c3392d0>
g[-2]
[rank0]:(Pdb) [rank0]:(<cell at 0x7fee7abbcf40: function object at
0x7fee5c3392d0>, <cell at 0x7fee7abbcf70: list object at
0x7fee7ab68940>, <cell at 0x7fee5c3210c0: list object at 0x7fee5e1
d6340>)
g[-3]
[rank0]:(Pdb) [rank0]:[tensor([[[-4.1127e-06, -3.3826e-06, 2.6226e-06,
..., 6.4969e-06,
[rank0]: -4.4405e-06, -4.7684e-06],
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136507
Approved by: https://github.com/awgu, https://github.com/kwen2501
Related: #132695
This PR uses padded dense <-> jagged conversions to handle binary pointwise broadcasting of (NT, T) and (T, NT). This includes:
* `(B, j0, D) + (1, 1, 1)`
* `(B, j0, D) + (B, 1, 1)`
* `(B, j0, D) + (B, 1, D)`
* etc.
This PR also adds (hacky) support for bool inputs to the jagged <-> padded dense conversions. The underlying CUDA kernels do not support integer / bool inputs; so the following workaround is employed: `convert input -> half, run conversion kernel, convert output -> bool`. Note that this bool support is needed specifically for the backward formula of `fmax`, and likely others.
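A usage sketch of the patterns listed above (illustrative shapes; assumes a CUDA device since the padded <-> jagged conversion kernels are CUDA):
```python
import torch

# (B, j0, D) jagged nested tensor with B=2, ragged j0, D=8
nt = torch.nested.nested_tensor(
    [torch.randn(3, 8), torch.randn(5, 8)], layout=torch.jagged, device="cuda"
)
t_scalar = torch.randn(1, 1, 1, device="cuda")  # (1, 1, 1)
t_batch = torch.randn(2, 1, 1, device="cuda")   # (B, 1, 1)
t_bd = torch.randn(2, 1, 8, device="cuda")      # (B, 1, D)

# Each of these now routes through the padded dense <-> jagged conversions.
out0 = nt + t_scalar
out1 = nt + t_batch
out2 = nt + t_bd
```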
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133021
Approved by: https://github.com/cpuhrsch
More or less literal copy-n-paste of c33b0580e6/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu (L24)
and
c33b0580e6/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu (L99)
Missing `uint8` implementation mimics CUDA behavior
Initial version coded live in https://www.youtube.com/watch?v=shi6Kb5xxvk
Later refinements:
- Switch from 2D dispatch to 1D one (to match CUDA behavior)
- Added batch + channel loops
- Fixed scale computation to match align corners behavior
- Added backward implementation
The backward implementation again mimics CUDA, so it has the same precision issues for `torch.half`, as well as a somewhat slow simulation of atomic adds using atomic compare-and-exchange on a pair of adjacent values, i.e.
```metal
template <typename T>
static inline void atomic_add_helper(
device atomic<int>* data,
long offset,
float value) {
auto ptr = data + (offset >> 1);
auto old = atomic_load_explicit(ptr, memory_order_relaxed);
union {
int i;
T t[2];
} val;
do {
val.i = old;
val.t[offset & 1] += static_cast<T>(value);
} while (!atomic_compare_exchange_weak_explicit(
ptr, &old, val.i, memory_order_relaxed, memory_order_relaxed));
}
```
Bump basic Metal language version to 3.0, as it's supported on MacOS13 and that's the first version that has `atomic_float`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136123
Approved by: https://github.com/albanD
Summary: Now that we have subprocess parallel compile on by default, we can change the internal compile_threads default to > 1 with a killswitch. Some jankiness so we can avoid evaluating the justknob at import.
Test Plan: Ran codecache tests with JK on, then canaried locally with JK off
Differential Revision: D62913998
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136246
Approved by: https://github.com/eellison
- Set the new tolerances to ~= N * eps(bfloat16), which should be a comfortable upper bound for tolerances, where N is the inner dimension of the matmul.
Logic behind choice of tolerance:
The maximum error of the summation of a series of N numbers in bfloat16 should be `N * epsilon(bfloat16)`; I confirmed by sampling different random seeds that the maximum observed error doesn't exceed this value and is usually much less.
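A rough justification (the standard forward-error bound for recursive summation):

$$\left|\,\mathrm{fl}\!\Big(\sum_{i=1}^{N} x_i\Big) - \sum_{i=1}^{N} x_i\,\right| \;\le\; (N-1)\,\varepsilon \sum_{i=1}^{N} |x_i| + O(\varepsilon^2)$$

When all summands have the same sign, $\sum_i |x_i| \approx |\sum_i x_i|$, so for results of order one the absolute error is at most roughly `N * eps(bfloat16)`.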
Fixes test failures on Arm® Neoverse™ V1 (not raised as an issue, as this hardware type is not currently covered by the linux-aarch64 workflow):
```
Traceback (most recent call last):
File "/var/lib/jenkins/workspace/test/test_torch.py", line 2478, in test_cdist_large
self.assertEqual(expected, actual)
File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3885, in assertEqual
raise error_metas.pop()[0].to_error(
AssertionError: Tensor-likes are not close!
Mismatched elements: 134118 / 1000000 (13.4%)
Greatest absolute difference: 0.03829193115234375 at index (291, 726) (up to 0.005 allowed)
Greatest relative difference: 0.03519868478178978 at index (291, 726) (up to 1.3e-06 allowed)
```
@malfet @jondea
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136315
Approved by: https://github.com/albanD
Summary:
- Added TORCH_LOGS=cache to dump cache stats on exit - supported by RemoteCache.
- Split REMOTE_CACHE_VERSION - it was used for both JKs fx_graph_memcache_version and autotune_memcache_version but they really should be separate (just in case we need to change one but not the other)
- Prepare `_ManifoldCache` for use with other subpath keys
- Move create_cache to be more public and use it in codecache
- Add _InductorMetaTy alias (still just a dict)
- Cleaned up some common cached_autotune calls in triton_heuristics
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D62648249
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136456
Approved by: https://github.com/oulgen
Original issue:
https://github.com/pytorch/ao/issues/890
The problem:
TracingContext.flat_params contains the original params, with subclasses not desugared, while the inductor freezing API works on AOT graphs, where subclasses are already desugared.
flat_params is used only for this logic, and storing desugared subclasses in it fixes the issue.
Testing:
```
python test/functorch/test_aotdispatch.py -k test_inductor_freezing_with_subclasses
```
Torch AO original failure:
```
python test/integration/test_integration.py -k test_int8_weight_only_quant_with_freeze
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136265
Approved by: https://github.com/bdhirsh
## Description
Fixes the accuracy failure of FP32 `jx_nest_base` of max-autotune.
The current epilogue fusion implementation in GEMM template assumes that the read of template buffer and the write of epilogue output in the epilogue node have the same index (the layout could be different but the index should be the same).
If the condition is not satisfied, the computation is wrong, leading to correctness issue for FP32 `jx_nest_base`.
This PR disables epilogue fusion with the GEMM template when the above condition is not satisfied.
### Unsupported epilogue:
`buf1` is the template buffer and `buf2` is the epilogue output buffer.
The store of `buf2`:
401408 * d0 + 100352 * d1 + **7168 * d2** + **1792 * d3** + 128 * d4 + d5
The load of `buf1` in the epilogue node:
401408 * d0 + 100352 * d1 + **1792 * d2** + **25088 * d3** + 128 * d4 + d5
The above two indexes are different.
```
CppTemplateBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[25088, 128], stride=[128, 1]))
ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.float32, size=[8, 4, 14, 4, 14, 128], stride=[401408, 100352, 7168, 1792, 128, 1]), data=Pointwise(
'cpu',
torch.float32,
def inner_fn(index):
i0, i1, i2, i3, i4, i5 = index
tmp0 = ops.load(arg5_1, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp1 = ops.load(buf0, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp2 = tmp0 + tmp1
tmp3 = ops.load(buf1, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp4 = tmp2 + tmp3
return tmp4
,
ranges=[8, 4, 14, 4, 14, 128],
origin_node=clone,
origins=OrderedSet([clone])
))
```
### Supported epilogue:
`buf1` is the template buffer and `buf2` is the epilogue output buffer.
The store of `buf2`:
d0 + 576 * d1 + 32 * d2
The load of `buf1` in the epilogue node:
d0 + 576 * d1 + 32 * d2
The above two indexes are the same.
The layout of `buf2` and `buf1` are different though which is handled by the reindexer:
`buf1`: `size=[324, 32], stride=[32, 1]`
`buf2`: `size=[1, 32, 18, 18], stride=[10368, 1, 576, 32]`
```
CppTemplateBuffer(name='buf1', layout=FixedLayout('cpu', torch.bfloat16, size=[324, 32], stride=[32, 1]))
ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 32, 18, 18], stride=[10368, 1, 576, 32]), data=Pointwise(
'cpu',
torch.bfloat16,
def inner_fn(index):
_, i1, i2, i3 = index
tmp0 = ops.load(buf1, i1 + 32 * i3 + 576 * i2)
tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16)
tmp2 = ops.load(_frozen_param4, i1)
tmp3 = tmp1 * tmp2
tmp4 = ops.load(arg7_1, i1 + 32 * i3 + 576 * i2)
tmp5 = tmp3 + tmp4
tmp6 = ops.to_dtype(tmp5, torch.bfloat16, src_dtype=torch.float32)
return tmp6
,
ranges=[1, 32, 18, 18],
origin_node=convert_element_type_4,
origins=OrderedSet([add, mul, convert_element_type_4])
))
```
## TODO
Add the support for fusions when the indexes are different in a follow-up PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135661
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
At the moment, this lowers torch._scaled_mm with tensorwise scaling and with rowwise scaling for both A and B.
We probably also want to support either combination of tensorwise and rowwise for A and B, as well as bias support.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136337
Approved by: https://github.com/chenyang78
Summary:
Add a customizable loss function callback to NodeAccuracySummary to
allow users to pass in their own loss function.
Also, fix some type errors and propagate better exception messages when
unexpected tensor comparisons occur. Finally, enhance the robustness of
`generate_numeric_debug_handle` in the case where it is called multiple
times on the same model, by avoiding reuse of the same IDs.
Test Plan: Added a test for this case in `test_numeric_debugger`.
Reviewed By: jerryzh168
Differential Revision: D62898297
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136282
Approved by: https://github.com/jerryzh168
This PR changes how `reconstruct` is done for a ConstDict. As of today, it works as follows:
(1) codegen(...) each pair of key/value
(2) create a new dictionary to hold the new items
(3) clear the original dictionary
(4) update the original dict with the one created in (2)
We do a micro-optimization in the generated bytecode to (see the sketch after this list):
- Only codegen the items that changed.
- Only clear the original dictionary if a key was removed.
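A sketch of the equivalent logic in plain Python (the actual change emits bytecode; the names here are illustrative):
```python
def reconstruct_const_dict(original, traced, changed_keys, deleted_keys):
    if deleted_keys:
        # A key was removed: clear and rebuild the whole dict.
        original.clear()
        original.update(traced)
    else:
        # No deletions: only re-assign the items that actually changed.
        for k in changed_keys:
            original[k] = traced[k]
```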
Fixes: #133487
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134876
Approved by: https://github.com/zou3519
1. We want to take option 3 as discussed in https://github.com/pytorch/pytorch/issues/135712, so every time we retry, we create a new TCPStore server first so that we don't need to append the attempt count as a prefix and can avoid eventual TCPStore sync failures. (This is only for the TCPStore-sharing-enabled case.)
2. We start a new server bound to an ephemeral port (i.e. 0) so it gets assigned a free port, and then pass that downstream (trainer or c10d). By doing so, TCPStore is managed by the elastic agent rather than having a race condition on binding to a specific port in the trainer (see the sketch after this list).
3. The port is then broadcast for dynamic_rendezvous.
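A minimal sketch of the ephemeral-port flow (illustrative only; assumes the port-0 binding described above is available):
```python
import torch.distributed as dist

# Bind the server to port 0 so the OS assigns a free port; the elastic agent owns it.
server = dist.TCPStore("localhost", 0, is_master=True, wait_for_workers=False)
port = server.port  # the ephemeral port actually chosen

# The port is then handed downstream (trainer / c10d / rendezvous), where clients do:
client = dist.TCPStore("localhost", port, is_master=False)
```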
Only one more question: what do we do about the store created from `_create_tcp_store` in torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py? Are we OK with creating a duplicate TCPStore server?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135957
Approved by: https://github.com/d4l3k, https://github.com/c-p-i-o
Fixes #93843
`EmbeddingBag()` / `embedding_bag()` support 1D inputs with offsets to handle raggedness. NJT is a natural fit here as it already maintains offsets of the same form. This PR updates the python-side to support NJT and adds corresponding OpInfo-based NJT tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135888
Approved by: https://github.com/cpuhrsch
Summary:
After the previous refactor, we can now call load_with_key directly from AOTAutogradCache to use the remote FXGraphCache.
This does *not* implement a remote AOTAutogradCache. It just allows AOTAutogradCache to work with remote FXGraphCache.
Test Plan: (Meta only tests)
Reviewed By: aorenste
Differential Revision: D62384944
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136173
Approved by: https://github.com/oulgen
- Sometimes having access to the `MixedPrecisionPolicy` in the `fsdp_pre_all_gather` is useful. See [here](https://github.com/pytorch/ao/pull/748/files#r1760375325) in the torchao INT8 mixed precision training PR.
- Sometimes having access to the owning `nn.Module` allows for using it for saving state. See [here](https://github.com/pytorch/pytorch/issues/114299#issuecomment-2298692762) for an example.
The major pain point here is how to deal with backward compatibility. For now, we use `inspect.signature` to check whether the user subclass follows the old vs. new signature. However, for the new signature, the `param_dtype` in the post-all-gather is redundant: if the user needed it, they could save it from the `mp_policy` passed into the pre-all-gather now.
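A sketch of that backward-compatibility check (the parameter names are assumptions for illustration):
```python
import inspect

def uses_new_pre_all_gather_signature(fsdp_pre_all_gather) -> bool:
    params = inspect.signature(fsdp_pre_all_gather).parameters
    # Assumed convention: the new signature additionally accepts the
    # MixedPrecisionPolicy ("mp_policy") and the owning nn.Module ("module").
    return "mp_policy" in params and "module" in params
```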
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136129
Approved by: https://github.com/weifengpy
Summary: Create another wrapper codegen class to handle ArrayRef for CPU. The goal is to simplify the regular cpp wrapper codegen logic and the generated cpp code.
Test Plan: CI
Differential Revision: D62961885
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136318
Approved by: https://github.com/frank-wei
Improves and enables a commented out test originally introduced in #131912
In `test_custom_tag_metadata_re_export()`, we check the added "custom" metadata to given nodes is preserved and not copied to other nodes after re-exporting
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136048
Approved by: https://github.com/zhxchen17
**Summary**
Fix a circular import in `torch/distributed/utils.py` found when running an internal test; see D62901023. Curious why this wasn't causing any issues. Is the relevant code deprecated and no longer used?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136286
Approved by: https://github.com/Skylion007
Fixes #131337
- add `arg_type` for workspace_arg, the type is consistent with the type in `generate_workspace_allocation()`.
- do not generate example tensors for `workspace`, and use `generate_workspace_allocation()` instead.
- add workspace allocation generation code to `kernel_autotune_calls`. e.g.
```python
workspace = empty_strided_cuda((1280, ), (1, ), torch.uint8)
workspace.zero_()
.....
triton_spl_fused_add_cumprod_0.run(buf2, arg0_1, arg1_1, workspace, 1, 10000, grid=split_scan_grid(1, 10000), stream=stream0)
del buf2, arg0_1, arg1_1, workspace
```
- add `empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda` to the header of triton autotune code.
The generated cpp has lines like the ones below, so we also implement a `zero_()` for `AtenTensorHandle`.
```cpp
static constexpr int64_t int_array_0[] = {1280L, };
static constexpr int64_t int_array_1[] = {1L, };
AtenTensorHandle workspace_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_uint8, cached_torch_device_type_cuda, 0, &workspace_handle));
RAIIAtenTensorHandle workspace(workspace_handle);
workspace.zero_();
```
- Fix grid_fn handling for grid computation: pass "RBLOCK" to `split_scan_grid`.
- Fix dynamic shapes:
Without the fix, during triton autotuning we generate code like `workspace = empty_strided_cuda((32*((255 + s0) // 256), ), (1, ), torch.uint8)`, where `s0` is not defined.
The solution is to use `V.graph.sizevars.size_hint(nbytes)` to realize the workspace size for triton autotuning. Note that we only realize it for the triton autotune code, not for the cpp cuda code.
- We also generate slightly different cpp code depending on if `abi_compatible` is turned on.
```cpp
RAIIAtenTensorHandle workspace(workspace_handle);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_(workspace.get()));
```
vs
```cpp
at::Tensor workspace = at::detail::empty_strided_cuda({8L*(c10::div_floor_integer(static_cast<int64_t>((255L + s0)), static_cast<int64_t>(256L))), }, {1L, }, at::kByte, c10::DeviceType::CUDA);
workspace.zero_();
```
Test Plan:
```
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
python test/inductor/test_cuda_cpp_wrapper.py DynamicShapesCudaWrapperCudaTests.test_consecutive_split_cumprod_cuda_dynamic_shapes_cuda_wrapper
TORCHINDUCTOR_ABI_COMPATIBLE=1 python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135552
Approved by: https://github.com/desertfire
Summary:
- Clean up cache test code a bit.
- Removed patch_fbcode() - it turned out to cause flaky issues (imagine if it set fbcode=False and then loaded, for the first time, a module that had a top-level fbcode check).
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D62648248
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136215
Approved by: https://github.com/bobrenjc93
**Motivations**:
A topological order of the scheduler nodes that optimizes the liveness of buffers can reduce peak memory utilization. This has been observed and studied, e.g., [here](https://arxiv.org/pdf/1910.02653) and [here](https://proceedings.mlr.press/v202/steiner23a/steiner23a.pdf).
**Solutions**:
1. implement a peak memory estimator via liveness analysis (a sketch follows this list)
2. implement a few memory-aware topological sorting algorithms and pick the one with the lowest peak memory
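A minimal sketch of item 1, under the assumption that each node exposes its input/output buffers and that a buffer dies right after its last consumer runs (the names here are illustrative, not the scheduler's actual API):
```python
def estimate_peak_memory(order, buf_sizes, last_use):
    """order: nodes in topological order; buf_sizes: bytes per buffer name;
    last_use: index in `order` of each buffer's final consumer."""
    live = peak = 0
    for i, node in enumerate(order):
        for buf in node.outputs:          # a buffer becomes live when produced
            live += buf_sizes[buf]
        peak = max(peak, live)
        for buf in node.inputs:           # a buffer dies after its last consumer
            if last_use[buf] == i:
                live -= buf_sizes[buf]
    return peak
```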
**Results**:
On some models we can reduce the peak memory significantly:
| model | batch size | peak_memory baseline | peak_memory new | ratio |
|:-----------------------------:|:----------:|:--------------------:|:---------------:|:-----:|
| alexnet | 128 | 1.17 | 0.99 | 1.19 |
| vgg16 | 64 | 4.10 | 3.57 | 1.15 |
| DebertaV2ForQuestionAnswering | 1 | 11.60 | 10.56 | 1.10 |
In the presence of compiler based AC, peak memory can be further reduced:
| model | batch size | peak_memory baseline | peak_memory new | ratio |
|:------------------------------:|:----------:|:--------------------:|:---------------:|:-----:|
| AlbertForMaskedLM | 4 | 6.87 | 6.43 | 1.07 |
| AlbertForQuestionAnswering | 4 | 8.69 | 7.76 | 1.12 |
| MobileBertForQuestionAnswering | 128 | 4.67 | 3.90 | 1.20 |
[Here](https://fb.workplace.com/groups/1075192433118967/posts/1499920537312819/?comment_id=1499938843977655&reply_comment_id=1499951630643043) is an internal use case.
**Other infos:**
* neutral model runtime, because the reordering happens after fusion, so the memory saving is _for free_.
* minimal compile time overhead, as the algorithm is linear in the number of edges of the inductor graph. For all huggingface benchmark models, the additional compile time is less than 1 second.
* no peak memory regression, since we only adopt a new order if the estimator says the peak memory is reduced. The estimator is unaware of operators' working memory, but for large models the working memory should be negligible. We haven't observed any significant regressions in any of our tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134874
Approved by: https://github.com/yf225
Fixes#134848
For BF16/FP16, when a tensor is passed via the `out` parameter of mean, the mean kernel should write the output into its storage. That doesn't happen: an `at::to` in the current code allocates storage again, but the `out` tensor's storage doesn't get updated, so it ends up not holding the mean output.
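A minimal repro sketch of the affected path (pre-fix behavior; with the fix, `out` holds the row means):
```python
import torch

x = torch.randn(4, 4, dtype=torch.bfloat16)
out = torch.empty(4, dtype=torch.bfloat16)
torch.mean(x, dim=1, out=out)
# Before this fix, the kernel wrote into a freshly allocated tensor created by
# an internal at::to(), so `out` was left without the mean result.
print(out)
```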
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135174
Approved by: https://github.com/soulitzer
Avoid allocating memory or dry-running the submodule during stage init.
Save user-provided input/output metadata during stage init, to allow
lazily initializing the buffers before the first step call.
Later, we plan to build on top of this to add lazy shape inference
(#130856) so that no input/output shapes are required at stage init.
For now, we require input/output tensors for stage init, but these
should be on meta device and stage should not allocate any real memory.
Note: this needs more thorough testing and review, but it worked on the
torchtitan 3d test.
TODO:
- delete 'device' arg from PipelineStage ctor? (infer it from the arg tensors passed to the first step call?) Separate PR.
- delete 'output_args' from PipelineStage ctor? we don't actually need
it, but we use it to do shape validation, which is why I didn't remove
it in this PR. Proposal: leave it until we add lazy shape inference?
Fixes #136225, #136226
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136243
Approved by: https://github.com/H-Huang, https://github.com/kwen2501
Summary: Internal profiler behaves differently after turning on triton.autotune_at_compile_time. Needs more investigation but turning it off for this test for now.
Reviewed By: henrylhtsang
Differential Revision: D63035855
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136356
Approved by: https://github.com/henrylhtsang
Summary:
Return from functions instead of using `skipTest`.
This is mostly to make our test report happier.
Skipped tests still show up in our Broken test report.
```
OK (skipped=1)
I0917 16:14:24.749060 1018907 StorageDemandControl.cpp:572] Flushing Demand Control ODS counters
Skipped: Store doesn't support extended APIs
```
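A minimal sketch of the pattern (the capability check is a hypothetical helper, not the real store API):
```python
def store_supports_extended_apis() -> bool:  # hypothetical capability check
    return False

class StoreTest:
    def test_extended_api(self):
        if not store_supports_extended_apis():
            return  # previously: self.skipTest("Store doesn't support extended APIs")
        # ... exercise the extended APIs ...
```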
Test Plan:
Tested locally.
Test shows up as passed instead of skipped.
```
Cache hits: 99%. Commands: 125048 (cached: 124961, remote: 10, local: 77)
Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. Build failure 0
```
Differential Revision: D62912065
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136244
Approved by: https://github.com/XilunWu
Original issue:
https://github.com/pytorch/ao/issues/890
The problem:
TracingContext.flat_params contains the original params, with subclasses not yet desugared.
Meanwhile, the inductor freezing API works on AOT graphs, where subclasses have already been desugared.
flat_params are used only for this logic, and storing the desugared subclasses in them fixes the issue.
Testing:
```
python test/functorch/test_aotdispatch.py -k test_inductor_freezing_with_subclasses
```
Torch AO original failure:
```
python test/integration/test_integration.py -k test_int8_weight_only_quant_with_freeze
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136265
Approved by: https://github.com/bdhirsh
Summary:
Add a customizable loss function callback to NodeAccuracySummary to
allow users to pass in their own loss function.
Also, fix some type errors and propagate better exception messages when
unexpected tensor comparisons occur. Finally, enhance the robustness of
`generate_numeric_debug_handle` in the case where it is called multiple
times on the same model, by avoiding reuse of the same IDs.
Test Plan: Added a test for this case in `test_numeric_debugger`.
Reviewed By: jerryzh168
Differential Revision: D62898297
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136282
Approved by: https://github.com/jerryzh168
Fixes https://github.com/pytorch/pytorch/issues/132331
We need another barrier here to ensure that the main thread doesn't stop the profiler while other threads are still using it (and crash). I can reliably reproduce the issue with `pytest -v test/profiler/test_cpp_thread.py -k test_profile_memory --flake-finder`.
### Testing
`pytest -v test/profiler/test_cpp_thread.py --flake-finder` all passes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136304
Approved by: https://github.com/briancoutinho
Summary: The change involves passing the expired timers to the log_debug_info_for_expired_timers function after to_json() has been applied. This change is made to provide a better debugging experience for the user.
Test Plan: unit tests
Reviewed By: gag1jain
Differential Revision: D62408767
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135913
Approved by: https://github.com/gag1jain
Summary:
This logs all operations when the tracing log level is enabled for the `TCPStoreLibUvBackend`. This is very useful for debugging collective operations when issues occur, as it logs all hosts and the keys that they're modifying. To minimize total data we only log the keys and not the values.
This change also makes the C10D_* macros much more efficient -- previously we would always format the log string even if it would never be printed, which is very wasteful for detailed tracing. The macros are now gated with an if statement, achieving the same behavior with no overhead.
Test Plan:
```
TORCH_DISTRIBUTED_DEBUG=DETAIL torchrun --nnodes 1 --nproc_per_node 1 --no-python /bin/bash -c "echo foo"
```
```
I0919 09:26:52.352013 34271 TCPStore.cpp:285] [c10d - debug] The server has started on port = 29500.
I0919 09:26:52.352246 34271 socket.cpp:783] [c10d - debug] The client socket will attempt to connect to an IPv6 address of (127.0.0.1, 29500).
I0919 09:26:52.352241 36903 TCPStoreLibUvBackend.cpp:1173] [c10d - debug] Uv main loop running
I0919 09:26:52.352308 34271 socket.cpp:854] [c10d - trace] The client socket is attempting to connect to [localhost]:29500.
I0919 09:26:52.353633 34271 socket.cpp:945] [c10d] The client socket has connected to [localhost]:29500 on SocketImpl(fd=41, addr=[localhost]:45646, remote=[localhost]:29500).
I0919 09:26:52.354422 34271 TCPStore.cpp:321] [c10d - debug] TCP client connected to host 127.0.0.1:29500
I0919 09:26:52.354558 36903 TCPStoreLibUvBackend.cpp:774] [c10d - trace] validate magic:1015412686 address:[localhost]:45646
I0919 09:26:52.354638 36903 TCPStoreLibUvBackend.cpp:789] [c10d - trace] ping nonce:34271 address:[localhost]:45646
I0919 09:26:52.356122 36903 TCPStoreLibUvBackend.cpp:866] [c10d - trace] add key:init/ val:1 address:[localhost]:45646
I0919 09:26:52.356308 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.356410 36903 TCPStoreLibUvBackend.cpp:846] [c10d - trace] get key:init/ address:[localhost]:45646
I0919 09:26:52.358688 36903 TCPStoreLibUvBackend.cpp:808] [c10d - trace] set key:/none/torchelastic/role_info/0 address:[localhost]:45646
I0919 09:26:52.360177 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.360296 36903 TCPStoreLibUvBackend.cpp:1004] [c10d - trace] multi_get key_count:1 address:[localhost]:45646
I0919 09:26:52.362076 36903 TCPStoreLibUvBackend.cpp:1036] [c10d - trace] multi_set key_count:1 address:[localhost]:45646
I0919 09:26:52.364001 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.364091 36903 TCPStoreLibUvBackend.cpp:846] [c10d - trace] get key:/none/torchelastic/assigned_ranks/0 address:[localhost]:45646
```
Differential Revision: D62924454
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136320
Approved by: https://github.com/c-p-i-o, https://github.com/XilunWu
Summary:
Add a third mode where we only print kernel names without dumping any intermediate actual tensor value info.
It can be helpful in quickly identifying the troublesome kernels in CUDA IMA issues.
thanks ColinPeppler and henrylhtsang for this "feature request".
Test Plan:
The output can look like this if we set `AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3`:
{F1871629091}
Differential Revision: D62791371
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136182
Approved by: https://github.com/henrylhtsang
Summary:
X-link: https://github.com/pytorch/benchmark/pull/2454
This adds structured logging overhead at a per compile basis to compilation metrics.
To do so, we track the frame_id_frame_compile_id that trace_structured uses to categorize compiles, and use that as the key in our timing table.
Implementation notes:
- If there are times we call trace_structured without a compile id, the time won't be measured. There's not really a good way around that today given the compile-id framework of compilation metrics. Strobelight is still the best way to measure on a per-job basis.
- We don't actually measure the time it takes to log the compilation metrics itself. Fundamentally, it's not possible to log this properly if we're storing the logging number *in* compilation metrics, since there's no way to measure it before we do it (unless we want discrepancies between dynamo_compile and tlparse, which seems suboptimal). Hopefully for a large job, the cost of structured-logging compilation metrics itself is small.
- I wanted to use frame_phase_timing here, but there's a bunch of ids to iron out, and I don't really want to deal with that headache. compilation_time_metrics is sort of what I want, but that isn't by frame/compile id, so it's also a bit off. Putting it into torch.logging as a separate thing so logging tracks its own overhead seems fine, though.
Test Plan:
Run benchmarks/nanogpt and staging logger. See that the new compilation metric is logged to the staged dynamo_compile table:
https://fburl.com/scuba/logger_staging_jjwu_30582a48f1ff9cf5f4ac50a4c40af/xazjg5xq
Note that the sum(structured_logging_overhead_s) / sum(entire_frame_compile_time) = 8.387 / 124.278 = 6%, which seems reasonable as the overhead for a small compilation like this.
You can also look at samples for a more detailed log of this.
Reviewed By: oulgen
Differential Revision: D62643611
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136142
Approved by: https://github.com/bobrenjc93
Summary:
To facilitate the PSS-2 upgrade, this uses `ndt.NDArray` instead of `np.ndarray` in type annotations. In Numpy-1.19 (PSS-1) it's an alias of `np.ndarray` -- a noop.
In Numpy-1.24, `ndt.NDArray` is a proper generic type, and without this change uses of `np.ndarray` generate this Pyre type error:
```counterexample
Invalid type parameters [24]: Generic type `np.ndarray` expects 2 type parameters.
```
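A sketch of the annotation change, assuming the `ndt` alias comes from `numpy.typing`:
```python
import numpy as np
import numpy.typing as ndt  # assumed alias for the `ndt` used above

def normalize(x: ndt.NDArray) -> ndt.NDArray:  # previously annotated as np.ndarray
    return x / np.linalg.norm(x)
```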
Test Plan: Sandcastle plus visual inspection
Differential Revision: D62977370
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136288
Approved by: https://github.com/kit1980
When tensor folding occurs during a matmul operation, the returned tensor is a view. This can cause issues when matmul is used inside a custom function and such a view is then returned as the output: it cannot be modified in place, which causes errors.
It can be especially problematic when an in-place allreduce is performed after such a function.
The issue is resolved by returning unsafe_view from matmul instead. This aligns the matmul decomposition with the eager implementation in that a non-view tensor is returned.
The test included in this PR reproduces the issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134568
Approved by: https://github.com/zou3519
Fixes #127049
There's already a meta func in `meta_registrations.py` for the `add_` and `sub_` methods. I added a second meta function for error checking, i.e., `int.add_/sub_(float)` and `bool.add_/sub_(other types)`.
Also the corresponding test with Dynamo passes, removed `@xfailIfTorchDynamo`.
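For reference, the eager error case that the new meta function mirrors looks roughly like this:
```python
import torch

x = torch.tensor([1, 2, 3])   # int64 tensor
try:
    x.add_(0.5)               # in-place add of a float into an integral tensor
except RuntimeError as e:
    print(e)                  # result type can't be cast to the integral output type
```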
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135864
Approved by: https://github.com/williamwen42
Changes in this PR:
- Monkey-patching `F.scaled_dot_product_attention` with a lambda seems to not work in some cases. This PR avoids using a lambda (see the sketch after this list).
- Running `fullgraph=True` and `fullgraph=False` in the same unit test seems to cause the two cases to interfere with each other and causes errors. This PR splits them into two separate unit tests.
- The checks in the unit tests might not work with compile cache. This PR turns off the cache in order to have a more predictable compile behavior to do unit test on.
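A sketch of the first bullet's change, replacing the lambda wrapper with a named function (the bookkeeping inside is illustrative):
```python
import torch.nn.functional as F

_orig_sdpa = F.scaled_dot_product_attention
calls = []

# Instead of: F.scaled_dot_product_attention = lambda *a, **kw: _orig_sdpa(*a, **kw)
def _counting_sdpa(*args, **kwargs):
    calls.append(1)                      # record the call for the test
    return _orig_sdpa(*args, **kwargs)

F.scaled_dot_product_attention = _counting_sdpa
```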
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor_fullgraph_True`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor_fullgraph_False`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_True`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_False`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136219
Approved by: https://github.com/yifuwang
Summary:
Quite a few times, we see the NCCL PG abort taking too long. There's no easy way to measure this, so let's add a counter to measure this across the stack.
This will help us measure how much time the NCCL abort takes.
Test Plan:
Unit tests
Reviewed By: c-p-i-o
Differential Revision: D62675010
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136067
Approved by: https://github.com/fduwjj
skip_if_rocm is used only in the multiprocess case (when the UT test class is a child of MultiProcessTestCase), where each individual process can exit with a skip code. If used for a single-process UT, it will cause the UT to fail because the process returns a non-zero exit code; use skipIfRocm in single-process UTs instead.
To avoid this confusion, this PR renames skip_if_rocm to skip_if_rocm_multiprocess.
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136161
Approved by: https://github.com/jithunnair-amd, https://github.com/kwen2501, https://github.com/fegin
Continuation of https://github.com/pytorch/pytorch/pull/131909. This PR makes numpy tests compatible with numpy>=2.0.0. Specifically it deals with APIs that have been removed from numpy-2.0.
Changes in this PR:
1. Use `numpy.exceptions.ComplexWarning` if `numpy.exceptions` namespace is present. In numpy-2.0 `numpy.ComplexWarning` has been removed in favor of using `numpy.exceptions.ComplexWarning` (see [numpy-2.0 migration guide](https://numpy.org/devdocs/numpy_2_0_migration_guide.html#changes-to-namespaces)). Note that `numpy.exceptions` was introduced in numpy-1.25.0 hence does not exist in numpy<=1.24.x.
2. Do the same for `numpy.exceptions.VisibleDeprecationWarning` (see the compatibility sketch after this list)
3. Use `np.sort(...,axis=0)` over `np.msort()`(`np.msort()` removed in numpy-2.0)
4. Use `np.pad()` over `np.lib.pad()` (`np.lib` removed in numpy-2.0)
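A sketch of the compatibility pattern behind items 1 and 2 (not the exact helper used in the tests):
```python
import numpy as np

try:  # numpy >= 1.25 exposes these under numpy.exceptions
    from numpy.exceptions import ComplexWarning, VisibleDeprecationWarning
except ImportError:  # numpy <= 1.24.x
    ComplexWarning = np.ComplexWarning
    VisibleDeprecationWarning = np.VisibleDeprecationWarning
```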
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136152
Approved by: https://github.com/atalman
Summary:
Remove sleep from the `watchdogHandler` function. This sleep unnecessary slows things down during a NCCL timeout.
Flight recorder is configured to take a minute, at most, to dump out its buffer.
This sleep ends up waiting for `8` minutes before destroy is called.
Test Plan: Unit tests.
Differential Revision: D62529875
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135760
Approved by: https://github.com/fduwjj, https://github.com/shuqiangzhang
Summary: Currently we process events in the regular allocation path, calling cudaEventQuery to check on the events, and this path can take some locks in the libcuda driver. It's not strictly necessary to process events in the allocation path: we could move this to a background thread, keep processing events regularly, and put the freed blocks on the free list.
Differential Revision: D62396585
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135524
Approved by: https://github.com/zyan0
Summary:
This diff adds an option to round the non-split blocks in caching allocator so that they can be reused without causing lots of fragmentation for large memory segments.
For example, if we specify the max_split memory size as 400MB, then all allocations larger than 400MB will not be split. Say we allocated some 1024MB blocks and these are cached in the allocator. If we request a new 500MB block, we round it to the nearest power-of-two division, that's 512MB, and add the default kLargeBuffer of 20MB, which gives 532MB. Since 532MB is less than the existing 1024MB block, the 1024MB block will not be used for this allocation; instead, a new 512MB block will be created. In this diff, we make the rounding buffer configurable and expose it as an option (max_non_split_rounding_size): if 512MB + max_non_split_rounding_size is greater than 1024MB, we will use the 1024MB block and won't create a new 512MB block with cudaMalloc. This option lets us pre-allocate some large blocks so that we can reuse them as much as possible and don't stall on calling cudaMalloc.
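A plain-Python sketch of the reuse decision described above (the allocator itself is C++; the names and units here are illustrative):
```python
def can_reuse_cached_block(request_mb, cached_block_mb, rounding_mb=20):
    # Round the request up to the next power of two, e.g. 500MB -> 512MB.
    rounded = 1
    while rounded < request_mb:
        rounded *= 2
    # Reuse the cached block if the rounded request plus the configurable
    # rounding allowance exceeds its size; otherwise cudaMalloc a new block.
    return rounded + rounding_mb > cached_block_mb

print(can_reuse_cached_block(500, 1024))                   # False -> new 512MB block
print(can_reuse_cached_block(500, 1024, rounding_mb=600))  # True  -> reuse the 1024MB block
```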
Differential Revision: D62758758
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136174
Approved by: https://github.com/zyan0
Summary:
# context
* for the root cause and background please refer to this [post](https://fb.workplace.com/groups/1028545332188949/permalink/1042204770823005/)
* the basic idea of this diff is to **short circuit the pytree flatten-unflatten function pairs** between two preserved modules, i.e., EBC/fpEBC and KTRegroupAsDict.
NOTE: There could be multiple EBCs and one single KTRegroupAsDict as shown in the [pic](https://fburl.com/gslide/lcyt8eh3) {F1864810545}
* short-circuiting the EBC-KTRegroupAsDict pairs is a special case and a must in most cases due to the EBC key-order issue with distributed table lookup.
* hide all the operations behind a control flag `short_circuit_pytree_ebc_regroup` to the torchrec main api call `decapsulate_ir_modules`, which should only be visible to the infra layer, not to the users.
# details
* The `_short_circuit_pytree_ebc_regroup` function finds all the EBC/fpEBC and KTRegroupAsDict modules in an unflattened module, retrieves their FQNs, and sorts them into in_fqns (regroup_fqns) and out_fqns (ebc_fqns). Because the fpEBC is currently swapped as a whole, we do some extra FQN logic to filter out the EBCs that belong to an enclosing fpEBC.
* a util function `prune_pytree_flatten_unflatten` removes the in-coming and out-going pytree flatten/unflatten function calls in the graph module, based on the given fqns.
WARNING: The flag `short_circuit_pytree_ebc_regroup` should be turned on if EBCs are used and EBC sharding is needed. Assertions are also added for the cases where a `KTRegroupAsDict` module can't be found or `finalize_interpreter_modules` is not `True`.
# additional changes
* absorb the `finalize_interpreter_modules` process inside the torchrec main api `decapsulate_ir_modules`.
* set `graph.owning_module` in export.unflatten as required by the graph modification
* add one more layer of `sparse_module` for closely mimicking the APF model structure.
Test Plan:
# run test
* serializer
```
buck2 run fbcode//mode/opt fbcode//torchrec/ir/tests:test_serializer
```
* apf
```
buck2 run fbcode//mode/opt fbcode//aps_models/ads/gmp/tests/ne/e2e_deterministic_tests:gmp_e2e_ne_tests -- --filter-text 'test_mtml_instagram_model_562438350_single_gpu_with_ir'
```
* local mp run
```
==== Finished E2E deterministic test for mtml_instagram_model_gmp_474023725_non_kjt_unary ====
finished
test_mtml_instagram_model_562438350_single_gpu_with_ir
Imports took: 6.0s! Profile with --import-profiler. --_ |""---__
Executed 1 example in 203.1s: |'.| || . """|
Successful: 1 | || || /|\""-. |
Failed: 0 | || || | | |
Skipped: 0 | || || | \|/ |
Not executed: 8 |."| || --"" '__|
https://testslide.readthedocs.io/ --" |__---"""
```
Differential Revision: D62606738
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136045
Approved by: https://github.com/angelayi
Currently when we deserialize inputs to nodes, we deserialize arguments with default values as kwargs. So deserializing `aten.uniform`, which has the signature `uniform(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)`, will become `uniform(x, from=0, to=1)`. However, this fails when running in python because `from` is a python keyword. So the solution here is to not deserialize it as a kwarg.
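A small illustration of the failure mode: `from` is a Python keyword, so generated code that passes it by name cannot even be parsed, while the positional form is fine:
```python
try:
    compile("op(x, from=0, to=1)", "<deserialized>", "eval")
except SyntaxError as e:
    print("kwarg form fails to parse:", e)

compile("op(x, 0, 1)", "<deserialized>", "eval")  # positional form parses fine
```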
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136036
Approved by: https://github.com/zhxchen17
`rms_norm()` is a nice-to-have for ViT :)
This PR:
* SymInt-ifies `rms_norm()`, allowing NJT to use the same decomp (see the usage sketch after this list).
* Adds torch_function-based input validation logic for nested-specific stuff (no normalization supported over the ragged dim for now) on the python NJT side.
* Adds multi-dim support (on non-ragged, non-batch dims) to `mean()` for NJT.
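A usage sketch of the first bullet (assumes a build with this PR; normalization is over the last, non-ragged dim):
```python
import torch
import torch.nn.functional as F

nt = torch.nested.nested_tensor(
    [torch.randn(3, 8), torch.randn(5, 8)], layout=torch.jagged
)
out = F.rms_norm(nt, normalized_shape=(8,))  # normalizing over the ragged dim is not supported
```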
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135872
Approved by: https://github.com/mikaylagawarecki
ghstack dependencies: #125947
The previous implementation of the `numpy()` method returned `fp64` when the tensor was `fp32`. This is unexpected, but seems to be caused by calling `__array__(dtype=None)` on the numpy array. I updated the code to implement the `numpy()` method explicitly and added tests to guard the behavior.
This needs to be cherry-picked into torch 2.5
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136162
Approved by: https://github.com/gramalingam, https://github.com/xadupre
When stub files (`*.pyi`) were removed from `optim` (#125556, #125452), some types that existed are no longer available. This pull request adds them back.
Just for reference, these types are used in `pytorch-lightning`'s `LightningCLI`. Command line interfaces are created automatically, and having type hints makes them nicer.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136185
Approved by: https://github.com/janeyx99
## Motivation
The device reported by tensor.device, for both sharded and non-sharded tensors, is set to cuda by default. Hence, while checking the FSDP UTs we see the following errors. This change updates the actual device type based on the created tensor.
```
[rank3] File "/root/repos/pytorch-training-tests/tests/pytorch/v2.4.0/distributed_hpu/fsdp/test_fsdp_dtensor_state_dict.py", line 143, in test_dtensor_sharded_tensor_state_dict_identical
[rank3] sharded_tensor_sd = ref_model.state_dict()
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1944, in state_dict
[rank3] hook_result = hook(self, destination, prefix, local_metadata)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank3] return func(*args, **kwargs)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py", line 752, in _post_state_dict_hook
[rank3] tensor.device,
[rank3] File "/usr/local/lib/python3.10/dist-packages/typing_extensions.py", line 2853, in wrapper
[rank3] return arg(*args, **kwargs)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/api.py", line 1152, in __torch_function__
[rank3] return dispatch(st_instance, func)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/api.py", line 1134, in dispatch
[rank3] return _SHARDED_OPS[func](types, args, kwargs, st._process_group)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/op_registry_utils.py", line 33, in wrapper
[rank3] return wrapped_func(types, args, kwargs, process_group)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py", line 52, in tensor_device
[rank3] dev = torch.device(torch.cuda.current_device())
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 878, in current_device
[rank3] _lazy_init()
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 305, in _lazy_init
[rank3] raise AssertionError("Torch not compiled with CUDA enabled")
[rank3] AssertionError: Torch not compiled with CUDA enabled
````
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134994
Approved by: https://github.com/fegin
Fixes https://github.com/pytorch/pytorch/issues/136064
In the linked repro, this issue was that there was some code like this:
```
# x has dtype torch.float32
def f(x):
    y = x.view(torch.float32)
    y.copy_(...)
```
Where because `view.dtype` is implemented today to potentially directly return its input, we would end up directly clobbering the proxy for our graph input (replacing its FX proxy value from `arg0_1` to `view_1`). This is not desirable, because we have careful assertions in AOTDispatcher that mutations only ever happen on graph inputs - but this clobbering caused the mutation to appear, from the perspective of the FX graph, like it was happening on a view of the input.
Why is this normally not a problem? Ordinarily, the `ADInplaceOrView` kernel for `view.dtype` will take the output of the view kernel, [and detach() it](https://github.com/pytorch/pytorch/blob/main/tools/autograd/gen_inplace_or_view_type.py#L466) (properly creating a fresh `TensorImpl`).
This does **not** happen, though, if you are executing the kernel from within a `__torch_dispatch__` region: the `ADInplaceOrView` logic has already run above you, so that key will be in the TLS exclude set.
This PR changes eager behavior - at first I considered trying to only change behavior under compile. But this problem isn't technically specific to PT2: if you ever rely on tensor identity from inside of a __torch_dispatch__ call, then we need to make sure the raw `view.dtype` kernel doesn't directly return the input.
I am also making the assumption that "`view.dtype` no-op'ing when the dtype is the same" is not a case worth optimizing in eager mode, and that the overhead of the `TensorImpl` creation is relatively negligible.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136074
Approved by: https://github.com/Skylion007, https://github.com/ezyang, https://github.com/albanD
ghstack dependencies: #136041
As in the title.
Tackles https://github.com/pytorch/ao/pull/821/files#r1759821413
The PR assumes that the existing tuning parameters are good also when using scaling arguments. This needs to be verified as a follow-up task.
Also, this PR redefines triton-contiguous tensors: the tensor must have strides not larger than 1. This will now allow zero strides that previously triggered `contiguous` call although the underlying memory buffer was contiguous.
Re: "a considerable slow-down occurs because tensor data is copied element-wise rather than chunk-wise" - this note should refer to a code (torch or triton?) that implements the element/chunk-wise copy so that we could verify that allowing zero strides indeed would not trigger element-wise copies. Atm, the performance increase in ViT-H benchmarks (that involve using 0 strides) is an evidence that allowing zero strides does not lead to slow-downs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136104
Approved by: https://github.com/cpuhrsch
We introduced the dispatchable backend for a ProcessGroup and collective in https://github.com/pytorch/pytorch/issues/86225. This PR is a follow-up cleanup of the options of a ProcessGroup: we ask users to either set the timeout or backend later on, or directly create the backend after creating a PG.
Also, PGNCCL is using the options class from ProcessGroup, but we should actually use the Options from the backend class. So this PR aligns the type and name with what we do on the cpp side. I don't change the signature of the public API, so it still uses args named "pg_options".
We make changes to the tests to align them with this change.
This is an attempt to reland D62008954 by fixing internal errors.
Differential Revision: [D62483294](https://our.internmc.facebook.com/intern/diff/D62483294/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135653
Approved by: https://github.com/wz337, https://github.com/H-Huang
Summary:
We refactor FxGraphCache.load into three phases:
- prepare_key, which checks that an inductor input is cacheable and bypasses otherwise
- load_with_key, which tries to lookup the key in the cache
- post compile, where we do some logging and run post compile steps
Splitting it along these lines will allow AOTAutogradCache to use load_with_key and still get access to all of the observability + remote cache logic when accessing FxGraphCache, without needing to pass key components, etc.
Differential Revision: D62314862
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135491
Approved by: https://github.com/oulgen
Fixes #136090
* Add support for isin to tensor half dtypes for CPU (just add a few extra dispatches).
* Seems like the CUDA implementation for bfloat16 was mostly compiled and available all along (internally it just calls sort and unique). To enable it, we just need to remove an assert that blocked access (sort's functionality was updated after the assert was added) and add the missing dtype support to unique.
* This unlocks more GPU functionality with minimal code bloat. I also added CPU kernels for the dtypes for parity.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136114
Approved by: https://github.com/malfet
By default inductor promotes arguments to the common highest dtype.
Having an empty token with dtype=torch.float32 results in dtype promotion for effectful ops during the lowering of with_effects.
This PR disables dtype promotion for this lowering.
It also removes the previous workaround that made the token dtype torch.bool.
Testing:
```
python test/distributed/test_c10d_functional_native.py -k test_inductor_dtypeview_memory_lea
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136039
Approved by: https://github.com/bdhirsh, https://github.com/eellison, https://github.com/zou3519
Summary: This implements a default backend proxy that tries to look up a backend via dlsym. What this enables is dynamically loading a module with a backend implementation without having it statically linked with the application.
Differential Revision: D62549295
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135967
Approved by: https://github.com/c-p-i-o
In this PR, we deprecate the _preserve_ops feature in the run_decompositions API. We can't kill this argument completely because the ExecuTorch team depends on it. As syncing between the two repos is non-trivial, I just leave this argument as deprecated for now. In the next PR, I will immediately remove it.
After this PR, run_decompositions will only decompose what's inside the decomp table and preserve the rest by default. Note that this feature is only rolled out to OSS for now. Old code path is protected under IS_FBCODE flag.
Differential Revision: [D62163161](https://our.internmc.facebook.com/intern/diff/D62163161/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135080
Approved by: https://github.com/justinchuby, https://github.com/avikchaudhuri, https://github.com/bdhirsh
> Ignore FSDP2 forward hook side-effects in AC
Under AC, FSDP2 does not rely on the forward hook to all-gather weights for recomputation; instead, it relies on the pre-backward hook to do this job:
451eaf0ff2/torch/distributed/_composable/fsdp/_fsdp_state.py (L219-L220)
So when we use `speculate_subgraph` to trace the utils.checkpoint AC region, we don't actually need to worry about the FSDP2 forward hook's side effects and can safely ignore it, because we do not (and don't expect to) re-run the FSDP2 forward hook during backward recomputation.
----
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134997
Approved by: https://github.com/zou3519
ghstack dependencies: #135727
Running Torchbench llama with dynamic size failed with
```
File "/localdisk/leslie/torch_inductor_community/pytorch/torch/fx/experimental/symbolic_shapes.py", line 4182, in produce_guards
raise ConstraintViolationError(
torch.fx.experimental.symbolic_shapes.ConstraintViolationError: Constraints violated (L['inputs'][0].size()[0])! For more information, run with TORCH_LOGS="+dynamic".
- Not all values of RelaxedUnspecConstraint(L['inputs'][0].size()[0]) are valid because L['inputs'][0].size()[0] was inferred to be a constant (32).
```
Skip this model for marking dynamic dim.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135960
Approved by: https://github.com/ezyang
This PR changes how `reconstruct` is done for a ConstDict. As of today, it works as follows:
(1) codegen(...) each pair of key/value
(2) create a new dictionary to hold the new items
(3) clear the original dictionary
(4) update the original dict with the one created in (2)
We do a micro optimization in the generated bytecode (see the sketch after this list) to:
- Only codegen the items that changed.
- Only clear the original dictionary if a key was removed.
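A rough sketch of the optimized decision (plain Python pseudologic, not the actual bytecode codegen):
```python
def plan_dict_reconstruction(original, traced):
    """Return a list of pseudo-ops needed to bring `original` up to date."""
    ops = []
    removed = original.keys() - traced.keys()
    if removed:
        # Only clear when a key was actually removed; then every item must be re-set.
        ops.append("CLEAR")
        changed = dict(traced)
    else:
        # Otherwise only codegen the items whose values changed (or are new).
        changed = {k: v for k, v in traced.items()
                   if k not in original or original[k] is not v}
    ops.extend(f"SET {k!r}" for k in changed)
    return ops

print(plan_dict_reconstruction({"a": 1, "b": 2}, {"a": 1, "b": 3, "c": 4}))
# ["SET 'b'", "SET 'c'"] -- no CLEAR, and the untouched 'a' is not re-emitted
```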
Fixes: #133487
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134876
Approved by: https://github.com/zou3519
Split out and modified from https://github.com/pytorch/pytorch/pull/130228. There were a bunch of subtle bugs, e.g., sometimes we need to use torch.ops.aten.{operator}.Tensor while other times we need torch.ops.aten.{operator}.default, or in the case of pow we need to use Tensor_Tensor. I figured it'd be easier to split out adding TensorReferenceAnalysis, add some tests, and do the actual integration in a separate diff.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135886
Approved by: https://github.com/ezyang
Updates the pybind11 submodule. The major patch note is an experimental new function, `cpp_conduit`, added to all pybind11 objects, which will make them more compatible across pybind11 versions, settings, and frameworks (such as nanobind). No code changes are needed on our end except updating the submodule.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136087
Approved by: https://github.com/malfet
This PR implements tracing of with contexts with TorchFunction modes which have the default enter/exit behavior (ie pushing/popping the mode)
Typically the bytecode for a context manager looks like this during a graph break:
1. graph call
2. enter context
3. unsupported code
4. exit context
5. resume call
resume fn structure:
1. enter context
2. jump
...
3. exit context
The issue with torch function modes is that side effects will replay any mutations to the torch function stack performed during tracing. So, we do not need to enter and exit around the unsupported code in the original function (doing so would result in a duplicate torch function mode entry during execution of the unsupported code), and we don't need to enter again in the resume function (the mode that was pushed from the side effects bytecode would still be on the stack).
So for torch function modes the structure of our output code is this:
1. graph call
2. mutate tf mode stack to replay mutations
3. unsupported code
4. on exception restore stack
5. resume function
Then our resume fn looks like this:
1. no-op enter torch function mode
2. jump
3. exit tf mode
To implement the no-op enter of the torch function mode, I added a torch function mode in polyfill which no-op enters but exits normally. This is needed because we still want to trace the with context in the resume function and exit properly (the exit instructions will still be in the function, so we need to generate instructions to set up the context).
Separately from the bytecode, dynamo also tracks contexts on the block stack, which is how the SETUP_* instructions are implemented. Naturally at a graph break, we exit these block stacks to properly reset the contexts entirely, so that we can re-enter around the unsupported code soundly. However, once again, in the torch function mode case, in the event of a graph break we do not want to perform any exit side effects, because we want to preserve the state of the mode stack as-is so that we will properly update the stack with the bytecode mentioned in the first section. If we exited here, dynamo would pop the mode off of the symbolic stack and not update the true python torch function mode stack with the suffix bytecode. All in all, for torch function modes we enter exactly once, update the global torch function mode stack with side effects bytecode, re-read this stack when compiling the resume function, and exit exactly once in the resume function. This matches the semantics of eager exactly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135422
Approved by: https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443, #135444
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66))
This PR adds support for calling the setattr of thread local objects. These objects have a slots impl, and since this doesn't appear to have any side effects, we call this setattr impl when replaying mutations; calling `object.__setattr__` on these objects results in a type error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
For tracing cond/while in eager, we trace the HOP with the eager backend with metadata torchfunction mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it is not able to be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
All of the previous benchmarks are similar, ListOfLinears should be representative enough.
I copied the previous benchmarks from unit tests without much intention; I was just trying to create a large number of benchmarks to better observe noise.
This PR keeps only one; we can add more as we see value and regressions in the future.
Also this diff adds a GPU version.
```
collecting compile time instruction count for basic_modules_ListOfLinears_eager
compile time instruction count for iteration 0 is 6479525851
compile time instruction count for iteration 1 is 1024432680
compile time instruction count for iteration 2 is 1019417317
compile time instruction count for iteration 3 is 1013603566
compile time instruction count for iteration 4 is 1008853980
compile time instruction count for iteration 5 is 1009541481
compile time instruction count for iteration 6 is 1005025533
compile time instruction count for iteration 7 is 1004116323
compile time instruction count for iteration 8 is 1000828633
compile time instruction count for iteration 9 is 999788323
collecting compile time instruction count for basic_modules_ListOfLinears_inductor
compile time instruction count for iteration 0 is 40837529730
compile time instruction count for iteration 1 is 18411921909
compile time instruction count for iteration 2 is 18383665161
compile time instruction count for iteration 3 is 18348983522
compile time instruction count for iteration 4 is 18349276590
compile time instruction count for iteration 5 is 18353046274
compile time instruction count for iteration 6 is 18346818581
compile time instruction count for iteration 7 is 18340057998
compile time instruction count for iteration 8 is 18331267320
compile time instruction count for iteration 9 is 18328381338
collecting compile time instruction count for basic_modules_ListOfLinears_inductor_gpu
compile time instruction count for iteration 0 is 15408870979
compile time instruction count for iteration 1 is 10949520859
compile time instruction count for iteration 2 is 11058786167
compile time instruction count for iteration 3 is 11003606719
compile time instruction count for iteration 4 is 10896406770
compile time instruction count for iteration 5 is 10982875189
compile time instruction count for iteration 6 is 10931848275
compile time instruction count for iteration 7 is 10956345008
compile time instruction count for iteration 8 is 11045384499
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135730
Approved by: https://github.com/ezyang, https://github.com/anijain2305
Summary:
Move towards consolidating strobelight profiler implementations between OSS and fbcode. This change is a first step towards that.
- Created a new function to abstract out compile time profiling enablement. This function allows profiler to switch between different function profilers (e.g. Thrift based or CLI based)
- Both OSS and Fbcode now use one compile time profiler in torch/_strobelight
Test Plan:
Tested OSS with following commands:
```
python torch/_strobelight/examples/compile_time_profile_example.py
python torch/_strobelight/examples/cli_function_profiler_example.py
TORCH_COMPILE_STROBELIGHT=TRUE TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python benchmarks/dynamo/huggingface.py --ci --accuracy --timing --explain --inductor --device cuda --training --amp --only XLNetLMHeadModel
```
See test commands for fbcode in comments.
Differential Revision: D62444551
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135953
Approved by: https://github.com/laithsakka