This is a cleaner implementation of opaque objects (https://github.com/pytorch/pytorch/pull/162660). Now we just need to:
Call `register_opaque_type` to register the type as "opaque" and allowed by custom ops. You also need to pass a unique name that maps to the type.
```python
class OpaqueQueue:
    def __init__(self, queue: list[torch.Tensor], init_tensor_: torch.Tensor) -> None:
        super().__init__()
        self.queue = queue
        self.init_tensor_ = init_tensor_

    def push(self, tensor: torch.Tensor) -> None:
        self.queue.append(tensor)

    def pop(self) -> torch.Tensor:
        if len(self.queue) > 0:
            return self.queue.pop(0)
        return self.init_tensor_

    def size(self) -> int:
        return len(self.queue)

register_opaque_type(OpaqueQueue, "_TestOpaqueObject_OpaqueQueue")
```
When creating the custom op, the schema will then use the unique name:
```python
self.lib = torch.library.Library("_TestOpaqueObject", "FRAGMENT")

torch.library.define(
    "_TestOpaqueObject::queue_push",
    "(_TestOpaqueObject_OpaqueQueue a, Tensor b) -> ()",
    tags=torch.Tag.pt2_compliant_tag,
    lib=self.lib,
)

@torch.library.impl(
    "_TestOpaqueObject::queue_push", "CompositeExplicitAutograd", lib=self.lib
)
def push_impl(queue: OpaqueQueue, b: torch.Tensor) -> None:
    assert isinstance(queue, OpaqueQueue)
    queue.push(b)
```
Using the custom op:
```python
queue = OpaqueQueue([], torch.zeros(3))
torch.ops._TestOpaqueObject.queue_push(queue, torch.ones(3))
self.assertEqual(queue.size(), 1)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165004
Approved by: https://github.com/albanD
While active, DebugMode reports tensor types, shapes, and placements. This change augments the report with tensor attributes from a configured set. The feature is intended to make the debug string easier to understand when dealing with larger outputs: for example, before running a model's forward pass we can annotate each parameter and buffer with its fully qualified name, so that we can see which ops are being executed against specific tensors.
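As a rough illustration of the annotation workflow described above (the attribute name below is purely illustrative, and how DebugMode is configured to pick it up is not shown here):
```python
import torch

# Tag each parameter/buffer with its fully qualified name so that, once DebugMode
# is configured to report this attribute, ops show up against recognizable tensors.
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
for fqn, t in list(model.named_parameters()) + list(model.named_buffers()):
    t._debug_fqn = fqn  # attribute name is an assumption for illustration
```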
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165109
Approved by: https://github.com/ezyang, https://github.com/pianpwk
Fixes #164814 - we update to include cases where we know the symbolic expression is statically one. There are two errors here: the first is in graph capture, where a tensor with size 0 yet a symbolic stride would attempt to keep the symbolic stride, resulting in a mismatch. The second is in Inductor codegen, where squeeze only checked whether size == 1, missing the case where a symbolic stride equals 1.
Also fixes #164924 (thanks to @bobrenjc93's fuzzer for finding an issue affecting users :) )
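A hedged sketch of the pattern involved (not a guaranteed repro of #164814): a zero-size dimension, a squeeze over a statically-one dimension, and another dimension marked dynamic so that strides become symbolic.
```python
import torch

def f(x):
    # squeeze over a dim whose size is only statically one
    return x.squeeze(1) + 1

x = torch.randn(0, 1, 4)                  # size-0 leading dim
torch._dynamo.mark_dynamic(x, 2)          # make the last dim (and derived strides) symbolic
out = torch.compile(f, backend="aot_eager")(x)
print(out.shape)
```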
### Test plan:
```
python test/dynamo/test_aot_autograd.py AotAutogradFallbackTests
```
Results in:
```
..
----------------------------------------------------------------------
Ran 49 tests in 45.622s
OK (expected failures=1)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164897
Approved by: https://github.com/laithsakka
Fixes #ISSUE_NUMBER
Failing due to a memory leak, e.g.:
https://github.com/pytorch/pytorch/actions/runs/18401518298/job/52434584458
```
2025-10-10T11:07:42.9485277Z _ TestSelectAlgorithmCudaCUDA.test_int8_woq_mm_cuda_batch_size_32_mid_dim_8_in_features_144_out_features_65_cuda_bfloat16 _
2025-10-10T11:07:42.9485389Z Traceback (most recent call last):
2025-10-10T11:07:42.9485869Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3278, in wrapper
2025-10-10T11:07:42.9485966Z method(*args, **kwargs)
2025-10-10T11:07:42.9486365Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3278, in wrapper
2025-10-10T11:07:42.9486454Z method(*args, **kwargs)
2025-10-10T11:07:42.9486849Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3277, in wrapper
2025-10-10T11:07:42.9486933Z with policy():
2025-10-10T11:07:42.9487380Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 2654, in __exit__
2025-10-10T11:07:42.9487473Z raise RuntimeError(msg)
2025-10-10T11:07:42.9488533Z RuntimeError: CUDA driver API confirmed a leak in __main__.TestSelectAlgorithmCudaCUDA.test_int8_woq_mm_cuda_batch_size_32_mid_dim_8_in_features_144_out_features_65_cuda_bfloat16! Caching allocator allocated memory was 19456 and is now reported as 29184 on device 0. CUDA driver allocated memory was 356712448 and is now 358809600.
2025-10-10T11:07:42.9488543Z
2025-10-10T11:07:42.9488722Z To execute this test, run the following from the base repo dir:
2025-10-10T11:07:42.9489520Z PYTORCH_TEST_CUDA_MEM_LEAK_CHECK=1 PYTORCH_TEST_WITH_SLOW_GRADCHECK=1 python test/inductor/test_cuda_select_algorithm.py TestSelectAlgorithmCudaCUDA.test_int8_woq_mm_cuda_batch_size_32_mid_dim_8_in_features_144_out_features_65_cuda_bfloat16
2025-10-10T11:07:42.9489525Z
2025-10-10T11:07:42.9489748Z This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
```
The test got added in #161680.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165147
Approved by: https://github.com/bbeckca
- Introduce `file_lock_timeout` in config (defaults to the current value of 600).
- Use the above config instead of the hardcoded 600.
This is useful when running stress tests.
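A hedged usage sketch; the exact attribute path is an assumption based on the description (an Inductor-side knob replacing the hardcoded 600-second codecache lock timeout):
```python
import torch._inductor.config as inductor_config

# Give stress tests more headroom than the 600s default (attribute path assumed).
inductor_config.file_lock_timeout = 1800
```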
Differential Revision:
D84109142
Privacy Context Container: L1297311
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165030
Approved by: https://github.com/hl475
I found that running any compiled function under DebugMode more than once will trigger recompilations, e.g. with the really simple modified test case in `test_compile`:
```
[0/1] [__recompiles] Recompiling function f in /data/users/pianpwk/ptclone/pytorch/test/distributed/tensor/debug/test_debug_mode.py:268
[0/1] [__recompiles] triggered by the following guard failure(s):
[0/1] [__recompiles] - 0/0:
[0/2] [__recompiles] Recompiling function f in /data/users/pianpwk/ptclone/pytorch/test/distributed/tensor/debug/test_debug_mode.py:268
[0/2] [__recompiles] triggered by the following guard failure(s):
[0/2] [__recompiles] - 0/1:
[0/2] [__recompiles] - 0/0:
```
Digging deeper, the guard failures were due to TENSOR_MATCH guards failing on dispatch key set checks (seemingly on the Python dispatch key):
5a1fbf45ad/torch/csrc/dynamo/guards.cpp (L199-L203)
This seems to be due to the `ignore_compile_internals=True` flag on custom dispatch modes, which causes these modes to "hide" themselves during compilation, leaving the Python dispatch key off when Dynamo guards on it.
The (maybe imperfect) solution is to mask out the Python keys for guard comparisons. This might be fine because custom dispatch modes won't appear here during compilation - `ignore_compile_internals=True` hides them, and `ignore_compile_internals=False` disables compile entirely?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164992
Approved by: https://github.com/williamwen42
The normal pytest/unittest failure patterns also match flaky tests (specifically, I think, tests that fail and then succeed on rerun in a new subprocess).
So print something specific for the log classifier to match against.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165163
Approved by: https://github.com/izaitsevfb
Summary: While saving state_dict tensors, deduping is done to reduce the number of tensor data blobs, and the storage pointer is used as the key. But when a tensor is empty, its storage pointer is 0, while the dtypes of such tensors can differ. The existing logic would consider all such tensors the same, and the model would later fail when a different dtype is expected. This change includes dtype in the dedup key as well. Non-empty tensors are not affected, since their storage pointers are unique.
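A minimal sketch of the pitfall being fixed (illustrative, not the actual saving code): two empty tensors of different dtypes share a null storage pointer, so keying dedup on the pointer alone collapses them, while adding dtype keeps them distinct.
```python
import torch

a = torch.empty(0, dtype=torch.float32)
b = torch.empty(0, dtype=torch.int64)

# Empty tensors typically report a null (0) storage pointer, so a pointer-only
# dedup key would treat them as the same tensor data despite different dtypes.
print(a.untyped_storage().data_ptr(), b.untyped_storage().data_ptr())

# Including dtype in the key keeps them distinct:
key_a = (a.untyped_storage().data_ptr(), a.dtype)
key_b = (b.untyped_storage().data_ptr(), b.dtype)
assert key_a != key_b
```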
Test Plan: TBD
Differential Revision: D84243094
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165090
Approved by: https://github.com/yiming0416
Implements the feature described in https://github.com/pytorch/pytorch/issues/162858.
This augments the existing graph break log with the latest 20 bytecode instructions (or however many the user frame provides). The targeted scenario is a graph break that happens without errors, i.e. when the user calls torch._dynamo.graph_break().
Meanwhile, in testing, the frames generated based on step() are not deterministic: sometimes the maximum count is reached, sometimes fewer instructions are generated, and bytecode generation is Python-version dependent. The test plan therefore checks the total bytecode line count instead of the exact bytecode output.
This is a helpful exercise for understanding bytecode transformation, symbolic_convert, and convert_frame, and it provides hands-on experience with the Dynamo workflow.
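For reference, a minimal example of the no-error graph break case the extra logging targets (with `TORCH_LOGS=graph_breaks` set, the break log should now also carry the recent bytecode):
```python
import torch

def fn(x):
    x = x + 1
    torch._dynamo.graph_break()  # explicit, error-free graph break
    return x * 2

compiled = torch.compile(fn, backend="eager")
compiled(torch.randn(4))  # the graph-break log entry now includes recent bytecode
```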
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164422
Approved by: https://github.com/williamwen42, https://github.com/mlazos
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Splits the training and inference paths for aot stage2 compile.
1. Split `aot_stage2_autograd` into `_aot_stage2a_partition`, `_aot_stage2b_fw_compile`, `_aot_stage2b_bw_compile`, and the rest.
2. Split `aot_stage2_inference` into `_aot_stage2b_inference_compile` and the rest.
I'm leaving these as functions with underscore names since the I/O interfaces and the exact boundaries of these splits are somewhat in the air.
Differential Revision: D84028203
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164808
Approved by: https://github.com/SherlockNoMad
Some more context at https://github.com/pytorch/pytorch/pull/164939
The basic point here is that Python decomps are guaranteed to be functional, whereas C++ ones are not. If we have a Python decomp, we should prefer it over the C++ one. This currently doesn't matter too much as CIA decomps will get functionalized, but it matters after the quoted PR because we now run these decompositions very late (to make it easy for things like aot_eager to get the fused versions of operators in proxy tensor).
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164970
Approved by: https://github.com/bdhirsh
Adding bf16 support for `torch._fake_quantize_learnable_per_channel_affine()` op by relaxing the type check on scale
TODO: need to add bf16 support to `per_tensor_affine_` as `torch._fake_quantize_learnable_per_tensor_affine_backward` gets called in the backward pass
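A hedged sketch of exercising the relaxed check on the per-channel op; the dtypes chosen for the other arguments are assumptions here rather than something stated in this PR:
```python
import torch

x = torch.randn(2, 4, dtype=torch.bfloat16)
scale = torch.ones(2, dtype=torch.bfloat16, requires_grad=True)       # bf16 scale now allowed
zero_point = torch.zeros(2, dtype=torch.float32, requires_grad=True)  # dtype assumed

out = torch._fake_quantize_learnable_per_channel_affine(
    x, scale, zero_point, 0, -128, 127, 1.0  # axis, quant_min, quant_max, grad_factor
)
out.sum().backward()
```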
**Test**
Modified unit test in `test_workflow_ops.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165098
Approved by: https://github.com/jerryzh168, https://github.com/andrewor14
This is a follow-up of #165037. It is generally recommended to use `is`/`is not` to compare types. This series of changes applies that suggestion across the code base, with the aim of finally enabling the related linter checks.
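For reference, the pattern the linter rule prefers:
```python
x = 1

if type(x) is int:   # preferred: identity check against the exact type object
    pass
if type(x) == int:   # discouraged: __eq__ can be overridden by metaclasses
    pass
```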
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165142
Approved by: https://github.com/albanD
This PR enables a number of distributed unit tests and applies necessary fixes to ensure they pass on ROCm platforms. The changes have been successfully tested on both MI200 and MI300 hardware.
This work addresses the following issues:
- https://github.com/ROCm/frameworks-internal/issues/13586
- https://github.com/ROCm/frameworks-internal/issues/13578
**Enabled Tests**
The following tests have been enabled and are now passing:
1. test_compiled_autograd_ctx
2. test_simple_mlp_fullgraph_backend_aot_eager
3. test_simple_mlp_fullgraph_backend_aot_eager_decomp_partition
4. test_simple_mlp_fullgraph_backend_inductor
5. test_nested_fully_shard_backend_aot_eager
6. test_nested_fully_shard_backend_aot_eager_decomp_partition
7. test_nested_fully_shard_backend_inductor_fullgraph_True
8. test_nested_fully_shard_backend_inductor_fullgraph_True_graph_partition
9. test_transformer_backend_aot_eager
10. test_transformer_backend_aot_eager_decomp_partition
11. test_storage_resize_zero_gpu
12. test_storage_resize_nonzero_gpu
13. test_fake_distributed_inductor
**Tests skipped due to upstream issues:**
1. test_nested_fully_shard_backend_inductor_fullgraph_False
2. test_transformer_backend_inductor_fullgraph_True
3. test_transformer_backend_inductor_fullgraph_True_graph_partition
4. test_transformer_backend_inductor_fullgraph_False
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165011
Approved by: https://github.com/jeffdaily
It is generally recommended to use `is`/`is not` to compare types. This series of changes applies that suggestion across the code base, with the aim of finally enabling the related linter checks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165037
Approved by: https://github.com/mlazos
## Description:
This PR refactors the autocast context manager in `autocast_mode.py` to simplify and centralize the logic for checking supported dtypes for each device. The previous implementation repeated similar checks for multiple device types. Now, a single mapping `device_supported_dtypes` is used to associate device types with their supported dtypes, and the validation logic is unified.
In my view, this makes the code easier to maintain and extend for new devices.
Please share any suggestions and comments with me.
BTW, in the original `xla` branch, the `supported_dtype` list is `[torch.float16, torch.bfloat16]` (5d8a226e23/torch/amp/autocast_mode.py (L358-L363)), but the warning message mentions only `torch.bfloat16`.
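An illustrative sketch of the centralized `device_supported_dtypes` idea (the table contents and the helper name are assumptions, not the exact code in `autocast_mode.py`):
```python
import warnings
import torch

# One table instead of per-device if/elif chains (entries here are illustrative).
device_supported_dtypes = {
    "cpu": (torch.bfloat16, torch.float16),
    "cuda": (torch.float16, torch.bfloat16),
    "xla": (torch.float16, torch.bfloat16),
}

def _check_autocast_dtype(device_type: str, dtype: torch.dtype) -> bool:
    supported = device_supported_dtypes.get(device_type, ())
    if dtype not in supported:
        warnings.warn(
            f"In {device_type} autocast, but the target dtype {dtype} is not supported. "
            f"Disabling autocast. {device_type} autocast only supports {supported}."
        )
        return False
    return True
```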
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163446
Approved by: https://github.com/FFFrog, https://github.com/albanD
As title
On Windows, we cannot modify the .dll to append weights at the end; the Windows .dll loader will complain that it's not a valid .dll file. So we store the weight blob as a separate file.
1. We add the following APIs, which allow getting the size of the weight blob and passing in a pointer to it:
```cpp
AOTI_API AOTIRuntimeError AOTInductorModelContainerGetConstantsBlobSize(
    AOTInductorModelContainerHandle container_handle,
    uint64_t* ret_size);

// Load weights from a single blob in weight_blob_ptr
AOTI_API AOTIRuntimeError AOTInductorModelUpdateConstantsFromBlob(
    AOTInductorModelContainerHandle container_handle,
    const uint8_t* weight_blob_ptr);
```
2. We also add a method in ModelContainerRunner to load the weights: if the runner sees a `.blob` file in the package, it will mmap the .blob file and use its contents to load the constants.
3. We also add the `USE_MMAP_EXTERNAL` macro. When it is defined, the model expects to load its weights from an external mmap'd weight file.
Test Plan:
```
buck run @mode/dev-nosan caffe2/test/inductor:test_aot_inductor -- -r test_large_mmaped_weights_on_disk
```
Also tested Windows cross-compilation with 6542566585/demo/main_voxtral.cpp
```
Loaded model.dll
audio_encoder loaded
C:\Users\shangdiy\source\repos\torchnative\demo\token_embedding\data\aotinductor\model\model.wrapper.so
Loaded model.dll
token_embedding loaded
C:\Users\shangdiy\source\repos\torchnative\demo\text_decoder\data\aotinductor\model\model.wrapper.so
Loaded model.dll
Loading weights from C:\Users\shangdiy\source\repos\torchnative\demo\text_decoder\data\aotinductor\model\model.wrapper_weights.blob
text_decoder loaded
Load latency (ms):
audio_encoder: 1011.234
archive extraction: 0.000
.so loading: 1011.197
token_embedding: 525.773
archive extraction: 0.000
.so loading: 525.704
text_decoder: 3324.130
archive extraction: 0.000
.so loading: 3323.979
Run latency (ms):
audio_encoder: 285.958
audio_encoder output: dtype=bfloat16, shape=[1, 1125, 3072], numel=3456000
token_embedding: 6.676
token_embedding output: dtype=bfloat16, shape=[1, 1138, 3072], numel=3495936
text_decoder: 576.519
text_decoder output: dtype=bfloat16, shape=[1, 1138, 131072], numel=149159936
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164526
Approved by: https://github.com/desertfire
Instead of collecting local results using all_gather_object followed by a local reduction, with this change we switch to a single all_reduce with the MIN reduction op to compute the final equals result.
This change is needed to enable the LocalTensor work (all_gather_object introduces challenges for DTensor and LocalTensor integration).
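Conceptually, the new path looks like this (the collectives are real `torch.distributed` APIs; the surrounding helper is illustrative and assumes an initialized process group):
```python
import torch
import torch.distributed as dist

def global_equal(local_equal: bool) -> bool:
    # One collective instead of all_gather_object + local reduction:
    # MIN over {0, 1} across ranks is a logical AND.
    t = torch.tensor([1 if local_equal else 0], dtype=torch.int32)
    dist.all_reduce(t, op=dist.ReduceOp.MIN)
    return bool(t.item())
```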
topic: not user facing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164999
Approved by: https://github.com/ezyang
Context is in https://www.internalfb.com/excalidraw/EX519691 and https://docs.google.com/document/d/1qnuXLZk_GYt_PksHTwkn7L2ELRDnYlIRPkHAlXTyuhw/edit?tab=t.0.
So for Autoparallel initial trace, we want to trace the graph with global shapes initially. But, for the local_map region, we are forced to trace with the expected local tensors. To the tracers, this looks weird, because it's a plain tensor input (representing DTensor's full tensor .to_local()) that we need to "redistribute".
After hacking up a miserable version that had cross-key dependencies, @ydwu4 proposed this simpler approach of overriding the fake key. This means the shape conversion is invisible to all dispatch keys above Fake, which covers all current tracing mechanisms. This manifests as the joint graph for the HOP body being traced with local shapes:
```python
# HOP forward, note local shapes (10, 80)
class GraphModule(torch.nn.Module):
def forward(self, primals_0: "f32[10, 80]"):
# No stacktrace found for following nodes
view: "f32[800]" = torch.ops.aten.view.default(primals_0, [-1]); primals_0 = None
add: "f32[800]" = torch.ops.aten.add.Tensor(view, 10); view = None
view_1: "f32[10, 80]" = torch.ops.aten.view.default(add, [10, 80]); add = None
return (view_1,)
# HOP backward, note local shapes (10, 80)
class GraphModule(torch.nn.Module):
def forward(self, tangents_0: "f32[10, 80]"):
# No stacktrace found for following nodes
clone: "f32[10, 80]" = torch.ops.aten.clone.default(tangents_0); tangents_0 = None
return (clone,)
```
while the rest of the graph is still traced with global shapes:
```python
# Parent graph joint, note global shapes (80, 80)
class inner_f(torch.nn.Module):
def forward(self, primals, tangents):
primals_1: "f32[80, 80]"; tangents_1: "f32[80, 80]";
primals_1, tangents_1, = fx_pytree.tree_flatten_spec([primals, tangents], self._in_spec)
# File: /home/xmfan/core/a/pytorch/test/higher_order_ops/test_local_map.py:597 in forward, code: return fn(x)
call_local_map = torch._higher_order_ops.local_map.call_local_map(primals_1); primals_1 = None
getitem: "f32[80, 80]" = call_local_map[0]; call_local_map = None
call_local_map_1 = torch._higher_order_ops.local_map.call_local_map(tangents_1); tangents_1 = None
getitem_1: "f32[80, 80]" = call_local_map_1[0]; call_local_map_1 = None
return pytree.tree_unflatten([getitem, getitem_1], self._out_spec)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164340
Approved by: https://github.com/ydwu4
ghstack dependencies: #164296, #164321, #164419, #164420
Reviewed GPT-5 Summary:
**Summary / Goal**
Add validation that partitioned forward/backward graphs respect placements.
**Details**
- Validates placement alignment in local_map.
- The HOP's autograd key gets called when we are tracing the joint; we need to validate:
- the inputs to the HOP's fwd gm (typically this is the dynamo rewritten inputs)
- the inputs to the HOP partitioned fwd/bwd gm
- the outputs of the HOP partitioned fwd/bwd gm
**Motivation**
Catch mismatch errors earlier, improve debugging.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164420
Approved by: https://github.com/ezyang
ghstack dependencies: #164296, #164321, #164419
Reviewed GPT5 summary:
**Summary / Goal**
Fix inconsistent variable naming for forward/backward graphs.
**Details**
- Those methods are actually for both fw and bw graphs now that we reuse the same op for fw/bw
**Motivation**
Improves clarity, avoids confusion.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164419
Approved by: https://github.com/bdhirsh
ghstack dependencies: #164296, #164321
Reviewed GPT5 summary:
**Summary / Goal**
Improve error reporting when local_map subgraph input/output counts mismatch placement info.
**Details**
- Adds descriptive runtime error messages.
**Motivation**
Helps debug local_map misalignments.
```python
AssertionError: Expecting 2 inputs to local_map function based on placements, but found 1. If the count matches for eager, Dynamo may have flattened inputs to the function or found additional tensors used via closures. Please adjust the input placements to match what the traced graph sees:
class GraphModule(torch.nn.Module):
def forward(self, l_args_0_: "f32[8, 8, 16]"):
# File: /home/xmfan/core/a/pytorch/test/higher_order_ops/test_local_map.py:523 in mismatch_input, code: return x + scalar, scalar
child: "f32[8, 8, 16]" = l_args_0_ + 10; l_args_0_ = None
return (child,)
.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164321
Approved by: https://github.com/ezyang, https://github.com/mlazos
ghstack dependencies: #164296
Reviewed GPT5 summary:
**Summary / Goal**
Add a utility to compute expected local tensor sizes and strides under *even sharding* in dtensor.
**Details**
- New function in `torch/distributed/tensor/_utils.py`.
- Computes local sizes/strides given global shape, mesh, and placements.
- Enforces divisibility of the global dimension by the mesh size (strict even sharding); see the sketch below.
- Complements `compute_global_tensor_info`.
**Motivation**
Ensures correctness for stride/layout computations in distributed tensors.
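A minimal sketch of the even-sharding rule the utility enforces (the real function in `torch/distributed/tensor/_utils.py` also handles strides, mesh dims, and placements; only the per-dimension size math is shown):
```python
def expected_local_dim_size(global_size: int, num_shards: int) -> int:
    # Strict even sharding: the global dim must divide evenly across shards.
    if global_size % num_shards != 0:
        raise ValueError(
            f"dim of size {global_size} is not evenly divisible across {num_shards} shards"
        )
    return global_size // num_shards
```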
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164296
Approved by: https://github.com/ezyang
This PR fixes the condition
```
if arg_signatures is None and self.kernel_type == "cpp" or "extern"
```
which is interpreted as
```
if (arg_signatures is None and self.kernel_type == "cpp") or ("extern"):
```
and it is always evaluated to `True`. According to the context the intention was
```
if arg_signatures is None and (self.kernel_type == "cpp" or self.kernel_type == "extern"):
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165033
Approved by: https://github.com/Skylion007
## TODO
Check on multi indices
```Python
@cute.jit
def score_mod(tSrS_ssa, b_idx, h_idx, q_idx, kv_idx, buffers):
    in_ptr4 = buffers[0]
    tmp0 = tSrS_ssa
    tmp1 = b_idx
    tmp2 = h_idx
    tmp3 = cute.make_fragment(1, cutlass.Int32)
    tmp4 = tmp3.store(32*tmp1 + tmp2)
    tmp5 = cute.make_fragment(1, cutlass.BFloat16)
    tmp6 = tmp3[0]
    tmp7 = tmp5[0] = (in_ptr4[tmp6])
    tmp8 = (tmp5.load()).to(cutlass.Float32)
    tmp9 = (tmp0 + tmp8)
    tSrS_ssa = tmp9
    return tSrS_ssa
```
I don't think that
```
tmp4 = tmp3.store(32*tmp1 + tmp2)
tmp5 = cute.make_fragment(1, cutlass.BFloat16)
tmp6 = tmp3[0]
tmp7 = tmp5[0] = (in_ptr4[tmp6])
```
is right, since this tmp6 value will be larger than the actual index dim; in this case it's B. See if it's possible to 1-D index.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162031
Approved by: https://github.com/v0i0
ghstack dependencies: #161118
Previously, when torch.are_deterministic_algorithms_enabled() is True, Inductor would
- skip autotuning pointwise kernels
- pick a fixed (and quite arbitrary) config for reductions
This PR changes the behavior to
- for pointwise kernels, we still do autotuning
- for reduction kernels, we use the recently added heuristic to pick a config
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164905
Approved by: https://github.com/jansel, https://github.com/v0i0
ghstack dependencies: #164801, #164532, #164904
Verify the deterministic mode with the torch.compile benchmark scripts.
Here is what my testing script does (pasted at the end):
- run a model in default mode and save its result
- run the model again in default mode, but distort the benchmarking results; compare with the saved result
- do the above again in deterministic mode
I tried to test a few models:
- BertForMaskedLM and GoogleFnet: I can repro the numeric change by distorting the benchmark result in the default mode. The non-determinism is gone in the deterministic mode.
- DistillGPT2: I cannot repro the numeric change by distorting the benchmarking result in the default mode. That does not surprise me much; a reduction order change does not always cause a numeric change.
```
model=GoogleFnet
export TORCHINDUCTOR_WRITE_ARE_DETERMINISTIC_ALGORITHMS_ENABLED=0
export TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 # disable autotune cache
export TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE=0
export TORCHINDUCTOR_FX_GRAPH_CACHE=0
export TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor_shunting/
export TORCHINDUCTOR_BENCHMARK_KERNEL=1
export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1
export INDUCTOR_TEST_DISABLE_FRESH_CACHE=1
# Non deterministic mode
# --float32 rather than --amp to make it easier to repro non-deterministic
echo "Save results for non-deterministic mode"
python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --save-model-outputs-to=/tmp/saved-non-deterministic.pkl
echo "Compare results with distorted benchmarking in non-deterministic mode"
TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT=inverse python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --compare-model-outputs-with=/tmp/saved-non-deterministic.pkl
echo "Save results for deterministic mode"
TORCHINDUCTOR_DETERMINISTIC=1 python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --save-model-outputs-to=/tmp/saved-deterministic.pkl
echo "Compare results with distorted benchmarking in deterministic mode"
TORCHINDUCTOR_DETERMINISTIC=1 TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT=inverse python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --compare-model-outputs-with=/tmp/saved-deterministic.pkl
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164904
Approved by: https://github.com/jansel, https://github.com/v0i0
ghstack dependencies: #164801, #164532
After lean export, we might want to be able to restore the original FQNs. This PR refactors one util function in export that sort of does this. Note that strict_export also has some complicated logic for updating the graph signature, which we don't want. I think we can gradually refine this util by handling constants, non-persistent buffers, etc., and change how strict_export does it today.
Differential Revision: [D83687844](https://www.internalfb.com/diff/D83687844)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164401
Approved by: https://github.com/avikchaudhuri
Previously we hardcoded the assumption in cuDNN that the inputs would be dense, which breaks when, e.g., the user chunks tensors, yielding noncontiguous inputs.
New test added to check this when `TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED=1` is set in `test/test_transformers.py`
One issue I noticed was that the old gating of nested tensor in `sdp_utils.cpp` seems to be a no-op? All of the inputs are reported as "dense" by the time that function is called in the nested tensor tests in `test/test_nestedtensor.py -k sdpa`
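Illustrative only (plain tensors on CPU, not the nested-tensor cuDNN path itself), just to show how chunking a fused projection produces non-dense q/k/v views:
```python
import torch
import torch.nn.functional as F

qkv = torch.randn(2, 8, 128, 3 * 64)           # (B, H, S, 3*D) fused projection output
q, k, v = qkv.chunk(3, dim=-1)                 # each chunk is a noncontiguous view
print(q.is_contiguous())                       # False: backends must not assume dense inputs
out = F.scaled_dot_product_attention(q, k, v)
```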
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164958
Approved by: https://github.com/Skylion007, https://github.com/drisspg
Summary: We have an internal user for whom caching broke because the unzipped paths are probably different per host. We can't think of a use case where a path change matters when the file content has not changed, so we are removing this part.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165020
Approved by: https://github.com/oulgen
In aot_stage2_autograd:
Before calling fw_compiler, we run pre_compile for the following wrappers:
* FakifiedOutWrapper
* FunctionalizedRngRuntimeWrapper
After, we run post_compile for the following wrappers:
* EffectTokensWrapper
* AOTDispatchSubclassWrapper
* FunctionalizedRngRuntimeWrapper
* FakifiedOutWrapper
In aot_stage2_inference:
Before calling inference compiler, we run pre_compile for the following wrappers (same as above):
* FakifiedOutWrapper
* FunctionalizedRngRuntimeWrapper
After, we run post_compile for the following wrappers (different than above):
* FunctionalizedRngRuntimeWrapper
* FakifiedOutWrapper
* EffectTokensWrapper
* AOTDispatchSubclassWrapper
This PR makes both do the post_compiles in the same order.
Differential Revision: D84213657
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165016
Approved by: https://github.com/zhxchen17, https://github.com/bdhirsh
This commit makes several cleanup changes to MHA.cpp, the main
one of which is removal of shared_ptr from MHAGraphCache as the
cache does not actually intend to share ownership. The changes are:
1. Remove shared_ptr from MHAGraphCache
2. Remove template arguments from MHAGraphCache
3. Remove unnecessary optional<shared_ptr<...>> vars
4. Change some functions with auto return type to the actual type
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164895
Approved by: https://github.com/eqy
The Windows cpp tests take ~1 hour according to logs. Each had run_test called on it individually, so I batched them together into a single run_test call for all of them; I believe it now takes 30 min. I turned off TD since I don't think cpp tests are included in TD stuff.
As always with batch scripts, I'm not sure if the errorlevel/error-surfacing stuff is correct.
This code was written with a lot of help from ChatGPT and Copilot.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164861
Approved by: https://github.com/huydhn
This PR updates build jobs that currently use linux.12xlarge to the c7i variant, which should improve build times by 15% - 20% depending on the job and reduce the costs of these jobs by 10% - 15%.
Signed-off-by: Thanh Ha <thanh.ha@linuxfoundation.org>
Summary:
* Add `torch.nn.functional.scaled_mm` as an abstraction around the C++
methods
* Wraps `torch._scaled_mm_v2` API by default, but user can force use of
the older `torch._scaled_mm` interface.
* Scaled MM tests now run on the new API
Test Plan:
`pytest test/test_scaled_matmul_cuda.py`
Reviewers:
Subscribers:
Tasks:
Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164142
Approved by: https://github.com/drisspg
ghstack dependencies: #164141
Summary:
* Add new scaled-MM API to future-proof / clean-up existing code.
* Scaling is explicitly described rather than inferred
* Swizzling of scales must now be defined (vs. inferred)
* Adds API support for multi-level scaling
* Refactor dispatch logic to make it easier to add new implementations
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164141
Approved by: https://github.com/drisspg
This PR includes a couple of changes to extend the FlightRecorder dump performed by the PyTorch watchdog:
- New knobs to control FR dump as suggested in the public documentation even for watchdog
(TORCH_INCLUDE_STACK_TRACE, TORCH_INCLUDE_ONLY_ACTIVE)
- Trigger the flight recorder dump on exceptions which could be triggered by any CUDA / host side error
(TORCH_NCCL_EXTRA_DUMP_ON_EXEC)
-> Can be used as a snapshot of the workload progress for post-mortem analysis
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164591
Approved by: https://github.com/fduwjj
This fixes AOTAutograd rms_norm not being bitwise equivalent to
eager, because it avoids a decomposition. You can force the
decomposition by having the decomposition in the dispatch table,
but if eager mode wouldn't have decomposed (because it went to the fused
one), we now default to preserving the fused call by default.
This largely reverts https://github.com/pytorch/pytorch/pull/103275/ for view ops. This means that in inference mode we could hit the wrong C++ kernel; if this occurs we should just SymInt'ify the C++ kernel.
Another neat side effect of this change is that Inductor's generated kernels for rms_norm now have rms_norm in their name.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164939
Approved by: https://github.com/bdhirsh
ghstack dependencies: #164573
In https://github.com/pytorch/pytorch/pull/106824, export decided to slow-path for MultiHeadAttention module (look into the PR description as to why). But that PR eventually caused a divergence between Dynamo and export.
Today, strict-export does not inline into builtin modules (like MultiHeadAttention), and therefore make_fx sees the original nn.Module and takes the slow path. But compile inlines into the nn module, and at this time the condition `_is_make_fx_tracing` is False. As a result, Dynamo takes a fast path, resulting in a different op being called.
This divergence is undesirable. There are 2 ways to fix it
1) Make export take the fast path - As explained in the https://github.com/pytorch/pytorch/pull/106824 , this might be difficult. So, we go to (2)
2) Make compile as well take the slow path - This is easy to implement. The con here is that Pytorch eager and compile will use different operators, which can cause numerics issues etc.
Since (2) is easy to do, we will follow this path. We are tracking the issue in https://github.com/pytorch/pytorch/issues/164062
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164721
Approved by: https://github.com/avikchaudhuri, https://github.com/tugsbayasgalan
Summary: Noticed that sometimes the combo kernel partition contains an empty group. Skip kernel generation in this case to unblock head model launching. The change in this diff is safe, but it would be better to root-cause why the empty group is being created.
Test Plan:
Lowering passed after applying the diff
Differential Revision: D84134471
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164918
Approved by: https://github.com/mlazos
We also want a Python-side API for users to reset FR recording of FR entries. We don't need to reset PGNCCL's member counter since we are creating a new PGNCCL anyway, but FR is a global ring buffer, so we need to reset it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164988
Approved by: https://github.com/tushar00jain
ghstack dependencies: #164752