Compare commits


1 Commit

Author SHA1 Message Date
c56b575e61 Add eager mode in inductor 2025-09-22 14:32:25 -07:00
653 changed files with 14286 additions and 20374 deletions
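The single commit in this range ("Add eager mode in inductor") threads a TorchDynamo "eager" backend option through the benchmark and CI test scripts diffed below. As a minimal sketch of what that backend selection amounts to, assuming nothing beyond the standard torch.compile API that the added validation step in the CI test script itself exercises:

    import torch

    def f(x):
        return x * 2

    # The "eager" backend runs the captured graph with plain PyTorch ops
    # (no Inductor code generation), so it is a cheap sanity check for Dynamo.
    assert "eager" in torch._dynamo.list_backends()
    compiled = torch.compile(f, backend="eager")
    print(compiled(torch.tensor([1.0, 2.0])))  # tensor([2., 4.])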

View File

@@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
-except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
+except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
@@ -1004,7 +1004,7 @@ if __name__ == "__main__":
install_condaforge_python(host, args.python_version)
sys.exit(0)
-python_version = args.python_version if args.python_version is not None else "3.10"
+python_version = args.python_version if args.python_version is not None else "3.9"
if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)

View File

@@ -69,8 +69,7 @@ RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0
FROM ${ROCM_IMAGE} as rocm
-ARG PYTORCH_ROCM_ARCH
-ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
ENV MKLROOT /opt/intel

View File

@@ -36,12 +36,6 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
BASE_TARGET=rocm
-PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
-EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;
*)
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"

View File

@@ -262,10 +262,13 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
-PYTHON_VERSION=3.10
+# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
+# We will need to update mypy version eventually, but that's for another day. The task
+# would be to upgrade mypy to 1.0.0 with Python 3.11
+PYTHON_VERSION=3.9
;;
-pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
-PYTHON_VERSION=3.10
+pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
+PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)

View File

@@ -1 +1 @@
-v2.28.3-1
+v2.27.5-1

View File

@@ -1 +1 @@
-v2.28.3-1
+v2.27.7-1

View File

@@ -1 +1 @@
-bbb06c0334a6772b92d24bde54956e675c8c6604
+5ae38bdb0dc066c5823e34dc9797afb9de42c866

View File

@@ -12,8 +12,8 @@ function do_install() {
rocm_version_nodot=${rocm_version//./}
-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
rocm_dir="/opt/rocm"

View File

@@ -40,16 +40,12 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
# we want the patch version of 6.4 instead
-if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;
*)

View File

@@ -82,7 +82,7 @@ case ${image} in
;;
manylinux2_28-builder:rocm*)
# we want the patch version of 6.4 instead
-if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
TARGET=rocm_final
@@ -90,10 +90,6 @@ case ${image} in
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;
manylinux2_28-builder:xpu)

View File

@@ -112,6 +112,8 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
+numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
+numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
@@ -132,7 +134,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.10"
+numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"
@@ -324,6 +326,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
lxml==5.3.0
#Description: This is a requirement of unittest-xml-reporting
+# Python-3.9 binaries
PyGithub==2.3.0
sympy==1.13.3

View File

@@ -1,15 +1,8 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-standard-imghdr==3.13.0; python_version >= "3.13"
-#Description: This is needed by Sphinx, so it needs to be added here.
-# The reasons are as follows:
-# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
-# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
-# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.

View File

@@ -72,7 +72,7 @@ def sample_vllm_test_library():
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
-"pytest -v -s entrypoints/offline_mode",
+"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {

View File

@@ -1,11 +1,11 @@
SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.0
+DESIRED_ROCM ?= 6.4
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma-rocm/build_magma.sh
.PHONY: all
-all: magma-rocm70
all: magma-rocm64
all: magma-rocm63
@@ -25,11 +24,6 @@ clean:
$(RM) -r magma-*
$(RM) -r output
-.PHONY: magma-rocm70
-magma-rocm70: DESIRED_ROCM := 7.0
-magma-rocm70:
-$(DOCKER_RUN)
.PHONY: magma-rocm64
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:

View File

@@ -6,8 +6,8 @@ set -eou pipefail
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
# Folders for the build
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
# Fetch magma sources and verify checksum
pushd ${PACKAGE_DIR}
-git clone https://github.com/jeffdaily/magma
+git clone https://bitbucket.org/icl/magma.git
pushd magma
git checkout ${MAGMA_VERSION}
popd

View File

@@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \
# Build the docs
pushd docs/cpp
-time make VERBOSE=1 html
+time make VERBOSE=1 html -j
popd
popd

View File

@@ -55,7 +55,7 @@ test_python_shard() {
setup_test_python
-time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
assert_git_not_dirty
}

View File

@@ -322,29 +322,23 @@ test_python_shard() {
# modify LD_LIBRARY_PATH to ensure it has the conda env.
# This set of tests has been shown to be buggy without it for the split-build
-time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python() {
# shellcheck disable=SC2086
-time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
}
test_python_smoke() {
-# Smoke tests for H100/B200
+# Smoke tests for H100
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
-test_python_smoke_b200() {
-# Targeted smoke tests for B200 - staged approach to avoid too many failures
-time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-assert_git_not_dirty
-}
test_h100_distributed() {
# Distributed tests at H100
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@@ -390,7 +384,6 @@ test_dynamo_wrapped_shard() {
--exclude-distributed-tests \
--exclude-torch-export-tests \
--exclude-aot-dispatch-tests \
---exclude-quantization-tests \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose \
--upload-artifacts-while-running
@@ -568,6 +561,43 @@ else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
+# Validate backend availability for dynamo_eager configs
+if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
+echo "Validating eager backend availability for TEST_CONFIG: ${TEST_CONFIG}"
+if ! python -c "import torch; backends = torch._dynamo.list_backends(); print('Available backends:', backends); assert 'eager' in backends, f'eager backend not available. Available: {backends}'"; then
+echo "ERROR: eager backend not available in this environment"
+echo "This might be due to missing dependencies or incorrect PyTorch installation"
+exit 1
+fi
+echo "eager backend validation successful"
+# Additional validation: test that torch.compile works with eager backend
+echo "Testing torch.compile with eager backend..."
+if ! python -c "
+import torch
+import torch._dynamo as dynamo
+def test_func(x):
+return x * 2
+# Test that eager backend works
+try:
+compiled_func = torch.compile(test_func, backend='eager')
+result = compiled_func(torch.tensor([1.0, 2.0]))
+print('torch.compile with eager backend test successful')
+except Exception as e:
+print(f'ERROR: torch.compile with eager backend failed: {e}')
+exit(1)
+"; then
+echo "ERROR: torch.compile with eager backend failed"
+exit 1
+fi
+fi
+# Debug logging for backend selection
+echo "TEST_CONFIG: ${TEST_CONFIG}"
+echo "DYNAMO_BENCHMARK_FLAGS: ${DYNAMO_BENCHMARK_FLAGS[*]}"
test_cachebench() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
@@ -629,6 +659,16 @@ test_perf_for_dashboard() {
shift
local backend=inductor
+# Allow surfacing eager metrics in CI by switching backend based on TEST_CONFIG
+if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
+backend=eager
+elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
+backend=aot_eager
+fi
+# Debug logging for backend selection in test_perf_for_dashboard
+echo "test_perf_for_dashboard: TEST_CONFIG=${TEST_CONFIG}, selected backend=${backend}"
+echo "DASHBOARD_TAG=${DASHBOARD_TAG}"
local modes=()
if [[ "$DASHBOARD_TAG" == *training-true* ]]; then
modes+=(training)
@@ -682,20 +722,37 @@ test_perf_for_dashboard() {
fi
if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
-$TASKSET python "benchmarks/dynamo/$suite.py" \
+echo "Running benchmark: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+echo "Command: $TASKSET python benchmarks/dynamo/$suite.py ${target_flag[*]} --$mode --$dtype --backend $backend --disable-cudagraphs $*"
+if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
---output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
+--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
+echo "ERROR: Benchmark failed for ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+echo "This might indicate an issue with the eager backend or benchmark configuration"
+exit 1
+fi
+echo "Benchmark completed successfully: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
fi
if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
-$TASKSET python "benchmarks/dynamo/$suite.py" \
+echo "Running benchmark: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
---output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
+--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
+echo "ERROR: Benchmark failed for ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+exit 1
+fi
+echo "Benchmark completed successfully: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
fi
if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
-$TASKSET python "benchmarks/dynamo/$suite.py" \
+echo "Running benchmark: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
+if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
--dynamic-batch-only "$@" \
---output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
+--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
+echo "ERROR: Benchmark failed for ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
+exit 1
+fi
+echo "Benchmark completed successfully: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
fi
if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then
TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
@@ -1163,12 +1220,6 @@ test_distributed() {
fi
}
-test_quantization() {
-echo "Testing quantization"
-python test/test_quantization.py
-}
test_rpc() {
echo "Testing RPC C++ tests"
# NB: the ending test_rpc must match the current function name for the current
@@ -1586,7 +1637,7 @@ test_executorch() {
test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
+test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
@@ -1630,25 +1681,6 @@ test_operator_benchmark() {
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
}
-test_operator_microbenchmark() {
-TEST_REPORTS_DIR=$(pwd)/test/test-reports
-mkdir -p "$TEST_REPORTS_DIR"
-TEST_DIR=$(pwd)
-cd benchmarks/operator_benchmark/pt_extension
-python -m pip install .
-cd "${TEST_DIR}"/benchmarks/operator_benchmark
-for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
-$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
---output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
---benchmark-name "PyTorch operator microbenchmark" --use-compile
-$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
---output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
---benchmark-name "PyTorch operator microbenchmark"
-done
-}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
@@ -1681,8 +1713,6 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
test_python_legacy_jit
-elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
-test_quantization
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
# TODO: run some C++ tests
echo "no-op at the moment"
@@ -1705,8 +1735,6 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
test_operator_benchmark cpu ${TEST_MODE}
fi
-elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
-test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@@ -1809,14 +1837,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
-elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
-test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
test_h100_symm_mem
-elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
-test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
test_h100_cutlass_backend
else

View File

@@ -25,7 +25,7 @@ echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
echo Run nn tests
-python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
+python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
if ERRORLEVEL 1 goto fail
popd

View File

@@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1
call %CONDA_HOME%\condabin\activate.bat testenv
if errorlevel 1 exit /b 1
-call conda install -y -q -c conda-forge libuv=1.51
+call conda install -y -q -c conda-forge libuv=1.39
call conda install -y -q intel-openmp
echo "install and test libtorch"

View File

@@ -69,8 +69,6 @@ readability-string-compare,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
WarningsAsErrors: '*'
-LineFilter:
-- name: '/usr/include/.*'
CheckOptions:
cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true

View File

@@ -22,9 +22,6 @@ self-hosted-runner:
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
-- linux.aws.h100
-- linux.aws.h100.4
-- linux.aws.h100.8
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@@ -59,7 +59,7 @@ runs:
set -x
# Create new py_tmp env with python-version
-${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
+${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
EXIT_CODE=$?

View File

@@ -1 +1 @@
-da63274d9f3d06ba5815b5c8786a7194923a0234
+367a480bd3534edf27a8dac3c6f7ea8af9d1ed45

View File

@@ -525,21 +525,6 @@
- Lint
- pull
-- name: typechecking
-patterns:
-- 'pyrefly.toml'
-- 'mypy.ini'
-- 'mypy-strict.ini'
-approved_by:
-- lolpack
-- maggiemoss
-- ndmitchell
-- kinto0
-mandatory_checks_name:
-- EasyCLA
-- Lint
-- pull
- name: superuser
patterns:
- '*'

View File

@@ -1,44 +1,41 @@
tracking_issue: 24422
ciflow_tracking_issue: 64124
ciflow_push_tags:
-- ciflow/b200
-- ciflow/b200-symm-mem
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
-- ciflow/h100
-- ciflow/h100-cutlass-backend
-- ciflow/h100-distributed
-- ciflow/h100-symm-mem
+- ciflow/triton_binaries
- ciflow/inductor
-- ciflow/inductor-cu126
-- ciflow/inductor-micro-benchmark
-- ciflow/inductor-micro-benchmark-cpu-x86
-- ciflow/inductor-perf-compare
-- ciflow/inductor-perf-test-nightly-rocm
-- ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-periodic
- ciflow/inductor-rocm
+- ciflow/inductor-perf-test-nightly-rocm
+- ciflow/inductor-perf-compare
+- ciflow/inductor-micro-benchmark
+- ciflow/inductor-micro-benchmark-cpu-x86
+- ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-cu126
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
-- ciflow/op-benchmark
- ciflow/periodic
- ciflow/periodic-rocm-mi300
-- ciflow/pull
-- ciflow/quantization-periodic
-- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/s390
+- ciflow/riscv64
- ciflow/slow
-- ciflow/torchbench
-- ciflow/triton_binaries
- ciflow/trunk
- ciflow/unstable
-- ciflow/vllm
-- ciflow/win-arm64
- ciflow/xpu
+- ciflow/vllm
+- ciflow/torchbench
+- ciflow/op-benchmark
+- ciflow/pull
+- ciflow/h100
+- ciflow/h100-distributed
+- ciflow/win-arm64
+- ciflow/h100-symm-mem
+- ciflow/h100-cutlass-backend
retryable_workflows:
- pull
- trunk
@@ -47,4 +44,4 @@ retryable_workflows:
- inductor-A100-perf-nightly
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
-mergebot: true
+mergebot: True

View File

@@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
}
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
-ROCM_ARCHES = ["6.4", "7.0"]
+ROCM_ARCHES = ["6.3", "6.4"]
XPU_ARCHES = ["xpu"]
@@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
@@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
@@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
-"nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "

View File

@@ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
-arches=["13.0"],
+arches=["12.8"],
python_versions=["3.12"],
),
branches="main",

View File

@@ -71,15 +71,12 @@ jobs:
with:!{{ upload.binary_env_as_input(config) }}
{%- if "aarch64" in build_environment %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "s390x" in build_environment %}
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
-{%- elif config["gpu_arch_type"] == "rocm" %}
-runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-timeout-minutes: 300
{%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.24xlarge.ephemeral

View File

@@ -67,7 +67,7 @@ jobs:
# an OOM issue when running the job, so this upgrades the runner from 4xlarge
# to the next available tier of 12xlarge. So much memory just to generate cpp
# doc
-runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory
+runner: ${{ inputs.runner_prefix }}linux.12xlarge
# TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
# Let's try to figure out how this can be improved
timeout-minutes: 360

View File

@@ -2,12 +2,6 @@ name: Get Changed Files
on:
workflow_call:
-inputs:
-all_files:
-description: "Whether to return all files instead of just changed files"
-required: false
-type: boolean
-default: false
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"
@@ -32,23 +26,17 @@ jobs:
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"
-# Check if all_files is requested
-if [ "${{ inputs.all_files }}" = "true" ]; then
-echo "all_files input is true, returning all files"
-echo "changed-files=*" >> "$GITHUB_OUTPUT"
-else
-# Use gh CLI to get changed files in the PR with explicit repo
-CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
+# Use gh CLI to get changed files in the PR with explicit repo
+CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
-fi
-echo "Changed files: $CHANGED_FILES"
-echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
fi
+echo "Changed files: $CHANGED_FILES"
+echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"

View File

@@ -273,8 +273,6 @@ jobs:
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
-OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}

View File

@@ -1,60 +0,0 @@
-name: Limited CI for symmetric memory tests on B200
-on:
-pull_request:
-paths:
-- .github/workflows/b200-symm-mem.yml
-workflow_dispatch:
-push:
-tags:
-- ciflow/b200-symm-mem/*
-schedule:
-- cron: 22 8 * * * # about 1:22am PDT
-concurrency:
-group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-cancel-in-progress: true
-permissions:
-id-token: write
-contents: read
-jobs:
-get-label-type:
-if: github.repository_owner == 'pytorch'
-name: get-label-type
-uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-with:
-triggering_actor: ${{ github.triggering_actor }}
-issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-curr_branch: ${{ github.head_ref || github.ref_name }}
-curr_ref_type: ${{ github.ref_type }}
-linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm:
-name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-uses: ./.github/workflows/_linux-build.yml
-needs: get-label-type
-with:
-runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runner: linux.12xlarge.memory
-build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-cuda-arch-list: '10.0'
-test-matrix: |
-{ include: [
-{ config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" },
-]}
-secrets: inherit
-linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
-name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-uses: ./.github/workflows/_linux-test.yml
-needs:
-- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm
-with:
-build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }}
-test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }}
-aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-secrets: inherit

View File

@@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
-tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
+tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@@ -52,8 +52,8 @@ jobs:
{ tag: "cuda12.9" },
{ tag: "cuda12.8" },
{ tag: "cuda12.6" },
+{ tag: "rocm6.3" },
{ tag: "rocm6.4" },
-{ tag: "rocm7.0" },
{ tag: "cpu" },
]
steps:

View File

@@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
-rocm_version: ["70", "64"]
+rocm_version: ["64", "63"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@@ -52,8 +52,8 @@ jobs:
{ name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
+{ name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
-{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },

View File

@@ -50,12 +50,12 @@ jobs:
strategy:
fail-fast: false
matrix:
-py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
-rocm_version: "7.0"
+rocm_version: "6.4"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda"
rocm_version: ""
@@ -108,6 +108,9 @@ jobs:
# Determine python executable for given version
case $PY_VERS in
+3.9)
+PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
+;;
3.10)
PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
;;
@@ -191,7 +194,7 @@ jobs:
strategy:
fail-fast: false
matrix:
-py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["xpu"]
timeout-minutes: 40
env:

View File

@@ -35,7 +35,6 @@ jobs:
contents: write
outputs:
pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
-pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
@@ -54,12 +53,8 @@ jobs:
tag_or_branch="${tag_or_branch#refs/heads/}"
# replace directory separators with _ in branch name
tag_or_branch="${tag_or_branch//\//_}"
-torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
-{
-echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
-echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
-echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
-} >> "$GITHUB_ENV"
+echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
+echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Copy docs requirements for inclusion
@@ -69,47 +64,30 @@ jobs:
cp .ci/docker/requirements-docs.txt docs/requirements.txt
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
-- name: Create PEP 517 compatible source distribution
-run: |
-pip install build==1.2.2.post1 || exit 1
-python -m build --sdist || exit 1
-cd dist || exit 1
- name: Upload source distribution for release
if: ${{ github.event_name == 'release' }}
uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
with:
-files: |
-${{ env.PT_RELEASE_FILE }}
-${{ env.PT_PEP517_RELEASE_FILE }}
-- name: Upload source distribution to GHA artifacts # for release tags
+files: ${{env.PT_RELEASE_FILE}}
+- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
-- name: Upload PEP 517 source distribution to GHA artifacts # for release tags
-if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
-uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
-with:
-name: ${{ env.PT_PEP517_RELEASE_FILE }}
-path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
- name: Set output
id: release_name
-run: |
-{
-echo "pt_release_name=${{ env.PT_RELEASE_FILE }}";
-echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}";
-} >> "${GITHUB_OUTPUT}"
+run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
@@ -125,9 +103,6 @@
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_release_name }}
-- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
-with:
-name: ${{ needs.release.outputs.pt_pep517_release_name }}
- name: Configure AWS credentials(PyTorch account)
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
@@ -138,9 +113,7 @@
s3-bucket: pytorch
s3-prefix: source_code/test
if-no-files-found: warn
-path: |
-${{ needs.release.outputs.pt_release_name }}
-${{ needs.release.outputs.pt_pep517_release_name }}
+path: ${{ needs.release.outputs.pt_release_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}

View File

@@ -70,7 +70,7 @@ jobs:
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
-pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
+pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14

View File

@@ -62,7 +62,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@@ -128,11 +128,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -174,11 +174,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -220,11 +220,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -265,7 +265,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@@ -331,11 +331,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6 build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -377,11 +377,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11" DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8 build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -423,11 +423,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11" DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0 build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -468,7 +468,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64 build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -534,11 +534,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6 build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -580,11 +580,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8 build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -626,11 +626,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0 build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -671,7 +671,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cpu-aarch64 build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -737,11 +737,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6 build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -783,11 +783,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8 build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -829,11 +829,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0 build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -874,7 +874,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cpu-aarch64 build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -940,11 +940,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -986,11 +986,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1032,11 +1032,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1077,7 +1077,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64 build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -1143,11 +1143,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6 build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1189,11 +1189,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8 build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1235,11 +1235,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0 build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1280,7 +1280,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64 build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -1346,11 +1346,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1392,11 +1392,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1438,11 +1438,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
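The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are a '|'-separated list of PEP 508 requirement strings, each gated by an environment marker (platform_system == 'Linux') and pinned to an exact CUDA-stack version; the nccl pin moving between 2.28.3 and 2.27.5/2.27.7 is the substantive change in this diff. A minimal sketch, assuming the value is simply split on '|' downstream (that splitting convention is an assumption, not taken from the workflow itself), of how such a string can be parsed and filtered with the packaging library:

```python
# Minimal sketch (not PyTorch's build tooling): parse a '|'-separated
# PYTORCH_EXTRA_INSTALL_REQUIREMENTS value and keep only the requirements
# whose PEP 508 marker matches the current platform.
from packaging.requirements import Requirement

raw = (
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
    "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux'"
)

for spec in (s.strip() for s in raw.split("|")):
    req = Requirement(spec)
    # marker.evaluate() checks the marker against the running interpreter/OS.
    if req.marker is None or req.marker.evaluate():
        print(f"pip install {req.name}{req.specifier}")
```

This only illustrates the marker semantics; the variable is actually consumed by the binary-build tooling invoked through the reusable workflows.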
View File
@ -316,6 +316,120 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_3-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-rocm6_3-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_3-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_3-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm6.3
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_3-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_3-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
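The new rocm6.3 test job above exports GPU_FLAG with the ROCm device and group flags and then runs the test-pytorch-binary action inside the builder image. A rough, hypothetical illustration (not the actual action; the command and image tag below are placeholders) of how such a flag string is typically folded into a container invocation:

```python
# Hypothetical sketch only: compose a `docker run` command from the GPU_FLAG
# value exported by the "ROCm set GPU_FLAG" step. The real test-pytorch-binary
# action handles this differently.
import os
import shlex

gpu_flag = os.environ.get(
    "GPU_FLAG",
    "--device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon",
)
image = os.environ.get("DOCKER_IMAGE", "pytorch/libtorch-cxx11-builder:rocm6.3")

cmd = [
    "docker", "run", "--rm",
    *shlex.split(gpu_flag),  # expose /dev/kfd, /dev/dri and the video/daemon groups
    image,
    "bash", "-c", "echo smoke-test placeholder",
]
print(shlex.join(cmd))
```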
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -333,7 +447,6 @@ jobs:
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
@ -430,118 +543,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_0-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_0-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_0-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.0
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -42,7 +42,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
-manywheel-py3_12-cuda13_0-build:
+manywheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -51,22 +51,22 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
-DESIRED_CUDA: cu130
+DESIRED_CUDA: cu128
-GPU_ARCH_VERSION: "13.0"
+GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
-DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-build_name: manywheel-py3_12-cuda13_0
+build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
-manywheel-py3_12-cuda13_0-test: # Testing
+manywheel-py3_12-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
-- manywheel-py3_12-cuda13_0-build
+- manywheel-py3_12-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -74,13 +74,13 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
-DESIRED_CUDA: cu130
+DESIRED_CUDA: cu128
-GPU_ARCH_VERSION: "13.0"
+GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
-DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
-build_name: manywheel-py3_12-cuda13_0
+build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
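The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above is a "|"-separated list of PEP 508 requirement strings, each gated by an environment marker so the CUDA userspace wheels are only pulled in on Linux. A minimal sketch of parsing such a string and evaluating its marker with the `packaging` library (illustrative only; the builder scripts that actually consume this variable are not shown in this diff):

```python
from packaging.requirements import Requirement

raw = (
    "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)
for spec in raw.split(" | "):
    req = Requirement(spec)
    # marker.evaluate() checks the marker against the current interpreter/platform
    print(req.name, str(req.specifier), req.marker.evaluate())
```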

File diff suppressed because it is too large

View File

@ -60,7 +60,6 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-timeout-minutes: 300
build_name: manywheel-py3_10-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:

View File

@ -57,7 +57,7 @@ on:
description: The list of configs used the benchmark
required: false
type: string
-default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
+default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,dynamo_eager_huggingface_perf,dynamo_eager_timm_perf,dynamo_eager_torchbench_perf,cachebench
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -97,18 +97,35 @@ jobs:
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
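Each matrix entry above names a benchmark config plus a (shard, num_shards) pair; the runner assigned to a shard only executes its slice of the work. A generic sketch of how a 1-based shard index could pick its slice of a test list (illustrative only; not the actual sharding logic used by the CI test scripts):

```python
def select_shard(tests, shard, num_shards):
    # shard is 1-based, matching the workflow matrix entries above
    return [t for i, t in enumerate(sorted(tests)) if i % num_shards == shard - 1]

# e.g. shard 2 of 6 over ten dummy test files
print(select_shard([f"test_{i:02d}" for i in range(10)], shard=2, num_shards=6))
```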

View File

@ -31,8 +31,6 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
-with:
-all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -55,7 +53,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
-docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
+docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -266,10 +264,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
-- name: Setup Python 3.10
+- name: Setup Python 3.9
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
-python-version: '3.10'
+python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies

View File

@ -1,46 +0,0 @@
name: operator_microbenchmark
on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC everyday
- cron: 0 6 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
opmicrobenchmark-build:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
secrets: inherit
opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -127,8 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
-# More memory is needed to build with asan
-runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -1,54 +0,0 @@
name: quantization-periodic
on:
push:
tags:
- ciflow/quantization-periodic/*
workflow_dispatch:
schedule:
# run weekly
- cron: "45 0 * * 0"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
name: get-default-label-prefix
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-quantization-build:
name: periodic-quantization-build
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.9'
test-matrix: |
{ include: [
{ config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
periodic-test-quantization:
name: periodic-test-quantization
uses: ./.github/workflows/_linux-test.yml
needs: periodic-quantization-build
with:
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
secrets: inherit

View File

@ -140,8 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
-# More memory is needed to build with asan
-runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -1,76 +0,0 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
name: B200 Smoke Tests
on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -53,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

.gitignore
View File

@ -82,7 +82,6 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
-torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated

View File

@ -49,7 +49,7 @@ init_command = [
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
-'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"',
+'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@ -153,7 +153,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
-'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
+'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'mypy==1.16.0',
@ -196,7 +196,6 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
-'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',
@ -1453,7 +1452,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
-'ruff==0.13.1', # sync with RUFF
+'ruff==0.12.9', # sync with RUFF
]
is_formatter = true
@ -1587,7 +1586,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
-'ruff==0.13.1', # sync with PYFMT
+'ruff==0.12.9', # sync with PYFMT
]
is_formatter = true

View File

@ -91,8 +91,6 @@ generated_cpu_cpp = [
"aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/RegistrationDeclarations.h",
"aten/src/ATen/VmapGeneratedPlumbing.h", "aten/src/ATen/VmapGeneratedPlumbing.h",
"aten/src/ATen/ViewMetaClasses.h",
"aten/src/ATen/ViewMetaClasses.cpp",
"aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/aten_interned_strings.h",
"aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/enum_tag.h",
"aten/src/ATen/core/TensorBody.h", "aten/src/ATen/core/TensorBody.h",
@ -835,6 +833,36 @@ pybind_extension(
], ],
) )
cc_library(
name = "functorch",
hdrs = glob([
"functorch/csrc/dim/*.h",
]),
srcs = glob([
"functorch/csrc/dim/*.cpp",
]),
deps = [
":aten_nvrtc",
":torch_python",
"@pybind11",
],
)
pybind_extension(
name = "functorch/_C",
copts=[
"-DTORCH_EXTENSION_NAME=_C"
],
srcs = [
"functorch/csrc/init_dim_only.cpp",
],
deps = [
":functorch",
":torch_python",
":aten_nvrtc",
],
)
cc_binary( cc_binary(
name = "torch/bin/torch_shm_manager", name = "torch/bin/torch_shm_manager",
srcs = [ srcs = [
@ -875,6 +903,7 @@ py_library(
], ],
data = [ data = [
":torch/_C.so", ":torch/_C.so",
":functorch/_C.so",
":torch/bin/torch_shm_manager", ":torch/bin/torch_shm_manager",
], ],
) )
@ -1077,7 +1106,6 @@ test_suite(
"aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/LazyNonNativeIr.h",
"aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchKey.cpp",
"aten/src/ATen/templates/RegisterDispatchDefinitions.ini", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
"aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
"aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/native_functions.yaml",
"aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/tags.yaml",
"aten/src/ATen/native/ts_native_functions.yaml", "aten/src/ATen/native/ts_native_functions.yaml",

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR) cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++ # sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -442,7 +443,7 @@ if(WIN32)
message( message(
WARNING WARNING
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
) )
else() else()
set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)
@ -1390,6 +1391,10 @@ endif()
include(cmake/Summary.cmake) include(cmake/Summary.cmake)
caffe2_print_configuration_summary() caffe2_print_configuration_summary()
if(BUILD_FUNCTORCH)
add_subdirectory(functorch)
endif()
# Parse custom debug info # Parse custom debug info
if(DEFINED USE_CUSTOM_DEBINFO) if(DEFINED USE_CUSTOM_DEBINFO)
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@ -1481,4 +1486,4 @@ else()
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]]) ]])
endif() endif()
endif() endif()

View File

@ -1,61 +1,20 @@
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html
# Include individual top-level files # Include source files in SDist
include CITATION.cff include CMakeLists.txt
include CODEOWNERS include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
include Dockerfile include BUCK BUCK.*
include LICENSE include requirements*.txt
include MANIFEST.in include version.txt
include Makefile include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
include NOTICE include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
include .bc-linter.yml
include .clang-format .clang-tidy
include .cmakelintrc
include .coveragerc
include .dockerignore
include .editorconfig
include .flake8
include .gdbinit
include .lintrunner.toml
include .lldbinit
include codex_setup.sh
include docker.Makefile
include pyrefly.toml
include ubsan.supp
# Include bazel and BUCK related files
include BUILD.bazel BUCK.oss
include WORKSPACE
include *.bzl
include .bazelignore .bazelrc .bazelversion
# Include general configuration files
include *.ini
# Include important top-level information
include *.md
# Include technical text files at the moment, comprises
# version.txt, CMakeLists.txt, requirements.txt
include *.txt
# Include ctags configuration
include .ctags.d/*.ctags
# Include subfolders completely
graft .devcontainer
graft .vscode
graft android graft android
graft aten graft aten
graft benchmarks
graft binaries graft binaries
graft c10 graft c10
graft caffe2 graft caffe2
graft cmake graft cmake
graft docs
graft functorch graft functorch
graft ios
graft mypy_plugins
graft scripts
graft test
graft third_party graft third_party
graft tools graft tools
graft torch graft torch
@ -63,37 +22,29 @@ graft torchgen
# FIXME: torch-xla build during codegen will fail if include this file in wheel # FIXME: torch-xla build during codegen will fail if include this file in wheel
exclude torchgen/BUILD.bazel exclude torchgen/BUILD.bazel
# The following exclusions omit parts from third-party dependencies that # Misc files and directories in SDist
# contain invalid symlinks[1] and that are not needed for pytorch, such as include *.md
# bindings for unused languages include CITATION.cff
prune third_party/flatbuffers/java include LICENSE NOTICE
prune third_party/flatbuffers/kotlin include mypy*.ini
prune third_party/ittapi/rust graft benchmarks
prune third_party/nccl/pkg/debian graft docs
prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-* graft mypy_plugins
graft scripts
# The following document is also an invalid symlink[1] and superfluous
exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md
# Omit autogenerated code
prune torchgen/packaged
# Omit caches, compiled, and scm related content
prune */__pycache__
prune **/.github
prune **/.gitlab
global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib
global-exclude *.py[cod] *.swp *~
global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
global-exclude .gitlab-ci.yml
# Misc files needed for custom setuptools command # Misc files needed for custom setuptools command
include .gitignore include .gitignore
include .gitmodules include .gitmodules
# [1] Invalid symlinks for the purposes of Python source distributions are, # Include test suites in SDist
# according to the source distribution format[2] links pointing outside the graft test
# destination directory or links with a `..` component, which is those of include pytest.ini
# concern here. include .coveragerc
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features # Prune generated/compiled files
prune torchgen/packaged
prune */__pycache__
global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]
prune */.git
global-exclude .git *~ *.swp

View File

@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites
If you are installing from source, you will need:
-- Python 3.10 or later
+- Python 3.9 or later
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool (Windows only)
@ -275,7 +275,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
-conda install -c conda-forge libuv=1.51
+conda install -c conda-forge libuv
```
#### Install PyTorch

View File

@ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
-set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

View File

@ -401,13 +401,30 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
-bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
+bool need_normalize_strides = false;
+int64_t expected_stride = 1;
+for (int i = src.dim() - 1; i >= 0; i--) {
+// detect if we do not meet continuous pattern
+// and the size is 1, so there is opportunity to normalize
+if (src.stride(i) != expected_stride && src.size(i) == 1) {
+need_normalize_strides = true;
+break;
+}
+expected_stride *= src.size(i);
+}
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
-view = src.as_strided(shape, {1}, src.storage_offset());
+auto strides = src.strides().vec();
+for (int i = 0; i < src.dim(); i++) {
+if (shape[i] < 2) {
+strides[i] = 1;
+}
+}
+view = src.as_strided(shape, strides, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
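As the comments in this hunk explain, the stride of a size-1 dimension carries no information, so it can be rewritten before exporting to DLPack whenever it breaks the expected contiguous pattern. A small Python sketch of the same idea, mirroring the loop-based variant shown on one side of the hunk (illustrative only, not the actual export path):

```python
import torch

def normalize_strides_for_dlpack(t: torch.Tensor) -> torch.Tensor:
    expected = 1
    need_normalize = False
    for i in range(t.dim() - 1, -1, -1):
        # a size-1 dim with an "off-pattern" stride is safe to rewrite
        if t.stride(i) != expected and t.size(i) == 1:
            need_normalize = True
            break
        expected *= t.size(i)
    if not need_normalize:
        return t
    strides = [1 if size < 2 else stride for size, stride in zip(t.shape, t.stride())]
    return t.as_strided(t.shape, strides, t.storage_offset())

v = torch.arange(6.).as_strided((1,), (3,))                   # size-1 dim, stride 3
print(v.stride(), normalize_strides_for_dlpack(v).stride())   # (3,) (1,)
```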

View File

@ -468,7 +468,7 @@ inline Tensor _sum_to(
// if we assume no reduction due to unbacked we ensure that at runtime.
TORCH_MAYBE_SYM_CHECK(
sym_eq(shape[i - leading_dims], sizes[i]),
-"non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:",
+"non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:",
shape[i - leading_dims],
", ",
sizes[i])

View File

@ -9,6 +9,11 @@
namespace at::functionalization {
+ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
+if (out_idx == this->out_index) return *this;
+return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
+}
// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl.
@ -37,12 +42,12 @@ namespace at::functionalization {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
at::Tensor t = update.new_val;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
-if (update.view_metas.empty()) { return t; }
+if (update.view_metas.empty()) return t;
std::vector<at::Tensor> tmp_values({base});
tmp_values.reserve(update.view_metas.size());
for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
-at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
+at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
// NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
// All of these ops require additional information to recover the sizes of the original tensor.
// If need to, we could probably apply this optimization and only bother computing tmp_values
@ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
tmp_values.push_back(std::move(next_view));
}
for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
+int64_t out_idx = update.view_metas[i].out_index;
// Each view inverse is implemented in ViewInverses.cpp.
-t = update.view_metas[i]->reverse(tmp_values[i], t);
+t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
}
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
return t;
@ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}
-void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
+void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");
if (metas.size() > 1) {
for (size_t i = 1; i < metas.size(); ++i) {
// Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
-TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
+TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "

View File

@ -8,89 +8,44 @@ namespace at::functionalization {
// See Note [Functionalization Pass In Core] // See Note [Functionalization Pass In Core]
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying)
/// scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to
/// handle.
ViewOrScatterInverse,
};
#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
static const char* name() { \
return #TYPE; \
}
#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
using SerializableTuple = std::tuple<__VA_ARGS__>
// ViewMeta is a class used by the functionalization pass to navigate between // ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor. // a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)` // For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta specialization // the functionalization pass will generate and store a ViewMeta on b that looks
// for `view1` operation on b that looks like: // like:
// //
// struct TORCH_API view1_ViewMeta : public ViewMeta { // ViewMeta(
// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); // [<captures>](const Tensor& base, int64_t mutated_view_idx) {
// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( // return base.view1(...);
// bool /* reapply_views */, // },
// const std::vector<int64_t>&); // [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
// // int64_t mutated_view_idx) -> at::Tensor {
// view1_ViewMeta(const SerializableTuple& tpl) // return at::functionalization::impl::view1_inverse(base, mutated_view,
// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} // ...);
//
// view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
// : ViewMeta(/*has_symbolic_inputs=*/false),
// reapply_views(reapply_views),
// size(size) {}
//
// Tensor forward(const Tensor& base) override {
// return base.view1(...);
// } // }
// //
// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { // The forward_fn lambda describes how to replay view1 on a tensor.
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// }
// //
// SerializableTuple to_serializable_tuple() { // The reverse_fn lambda describes how, given a tensor that is already a view,
// return std::make_tuple(reapply_views, size);
// }
//
// bool reapply_views;
// std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass: // how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details. // View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta { struct ViewMeta {
ViewMeta( ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
bool has_symbolic_inputs, bool has_symbolic_inputs,
bool is_multi_output = false, bool is_multi_output = false,
bool is_as_strided = false, bool is_as_strided = false,
int64_t out_idx = 0) int64_t out_idx = 0)
: out_index(out_idx), : forward_fn(std::move(forward)),
reverse_fn(std::move(reverse)),
out_index(out_idx),
is_multi_output(is_multi_output), is_multi_output(is_multi_output),
is_as_strided(is_as_strided), is_as_strided(is_as_strided),
has_symbolic_inputs(has_symbolic_inputs) {} has_symbolic_inputs(has_symbolic_inputs) {}
virtual ~ViewMeta() = default; std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
virtual Tensor forward(const Tensor& base) = 0;
virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
// See Note [out_idx in ViewMeta] // See Note [out_idx in ViewMeta]
int64_t out_index; int64_t out_index;
@ -102,17 +57,10 @@ struct ViewMeta {
// Tells us if this view operation has any symbolic inputs // Tells us if this view operation has any symbolic inputs
bool has_symbolic_inputs; bool has_symbolic_inputs;
// Returns a new ViewMeta with the same forward/reverse // Returns a copy of the current ViewMeta, if out_idx matches the current
// out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
// functions, but a new out index. // functions, but a new out index.
// ViewMeta to_out_idx(int64_t out_idx);
// This method should be implemented by those `ViewMeta` that have more than
// one output.
virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"ViewMeta::to_out_index not implemented. ",
"Likely because there's only one output.");
}
}; };
// FunctionalStorageImpl is a subclass of StorageImpl used by the // FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const at::Tensor new_val; const at::Tensor new_val;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const std::vector<std::shared_ptr<ViewMeta>> view_metas; const std::vector<ViewMeta> view_metas;
}; };
explicit FunctionalStorageImpl(const Tensor& value); explicit FunctionalStorageImpl(const Tensor& value);
void add_update( void add_update(
const Tensor& updated_val, const Tensor& updated_val,
const std::vector<std::shared_ptr<ViewMeta>>& view_metas); const std::vector<ViewMeta>& view_metas);
bool apply_updates(); bool apply_updates();
const Tensor& base() { const Tensor& base() {
return base_; return base_;

View File

@ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const {
// - view_value: The output tensor that we need to wrap. // - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from. // - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper( FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
const Tensor& view_value, : c10::TensorImpl(
const FunctionalTensorWrapper* base, c10::DispatchKeySet(DispatchKey::Functionalize),
const std::shared_ptr<functionalization::ViewMeta>& meta) view_value.dtype(),
: c10::TensorImpl( base->storage().data_ptr().device()
c10::DispatchKeySet(DispatchKey::Functionalize), ),
view_value.dtype(), value_(view_value),
base->storage().data_ptr().device()), is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
value_(view_value), was_storage_changed_(base->was_storage_changed_),
is_multi_output_view_( is_symbolic_(base->is_symbolic_)
base->is_multi_output_view_ || meta->is_multi_output), {
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
set_constructor_metadata(); set_constructor_metadata();
@ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(
view_metas_ = base->view_metas_; // copy view_metas_ = base->view_metas_; // copy
} }
view_metas_.push_back(meta); view_metas_.push_back(meta);
maybe_mark_symbolic(meta.get()); maybe_mark_symbolic(meta);
storage_ = base->storage_; // alias this tensor's storage with the base tensor's storage_ = base->storage_; // alias this tensor's storage with the base tensor's
} }
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl()); return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
} }
@ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
} }
// See Note [Functionalization Pass - Inplace View Ops] // See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) { void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
view_metas_.push_back(meta); view_metas_.push_back(meta);
// Manually track the fact that this tensor received a metadata mutation! // Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true; has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta.get()); maybe_mark_symbolic(meta);
// Note [Functionalization Pass - Inplace View Ops] // Note [Functionalization Pass - Inplace View Ops]
// So, these ops are special - they're mutation AND view ops. They get special codegen. // So, these ops are special - they're mutation AND view ops. They get special codegen.
// An example is transpose_, e.g. `a.transpose_()` // An example is transpose_, e.g. `a.transpose_()`
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
at::AutoDispatchSkipFunctionalize guard; at::AutoDispatchSkipFunctionalize guard;
value_ = meta->forward(value_); value_ = meta.forward_fn(value_, meta.out_index);
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
} }
@ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() {
regenerate_from_base(); regenerate_from_base();
} }
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const { Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
return view_metas_; auto t = base;
// Reapply views to get the viewed tensor from the base in alias_
for (auto& view_meta: view_metas_) {
t = view_meta.forward_fn(t, view_meta.out_index);
}
return t;
} }
void FunctionalTensorWrapper::regenerate_from_base() { void FunctionalTensorWrapper::regenerate_from_base() {
@ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
auto t = storage_impl->base(); auto t = storage_impl->base();
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); t = apply_view_metas(t);
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
replace_(t, /*from_lazy_regenerate=*/true); replace_(t, /*from_lazy_regenerate=*/true);
@ -721,11 +727,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
} }
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) { bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
if (t_list.empty()) { return false; } if (t_list.empty()) return false;
auto functional_count = 0; auto functional_count = 0;
for (const auto i : c10::irange(t_list.size())) { for (const auto i : c10::irange(t_list.size())) {
auto const & e= t_list[i]; auto const & e= t_list[i];
if (!e.has_value() || !e->defined()) { continue; } if (!e.has_value() || !e->defined()) continue;
if (isFunctionalTensor(e)) { if (isFunctionalTensor(e)) {
++functional_count; ++functional_count;
} }
@ -735,10 +741,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
template <typename T> template <typename T>
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) { static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
if (list.size() == 0) { return false; } if (list.size() == 0) return false;
auto functional_count = 0; auto functional_count = 0;
for (const auto& tensor : list) { for (const auto& tensor : list) {
if (!tensor.defined()) { continue; } if (!tensor.defined()) continue;
if (isFunctionalTensor(tensor)) { if (isFunctionalTensor(tensor)) {
++functional_count; ++functional_count;
} }
@ -756,28 +762,20 @@ void freeze_functional_tensor(const Tensor& tensor) {
functional_base_impl->freeze_storage(); functional_base_impl->freeze_storage();
} }
Tensor create_functional_tensor_with_view_meta( Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
const at::Tensor& view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
auto meta_ = meta;
if (out_idx != 0) { if (out_idx != 0) {
// Note [out_idx in ViewMeta] // Note [out_idx in ViewMeta]
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
meta_ = meta->to_out_index(out_idx); meta = meta.to_out_idx(out_idx);
} }
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_); return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
} }
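
Note [out_idx in ViewMeta] above says that a multi-output view op (split, chunk, unbind, ...) replays the same op description once per output, with the out index selecting which output the forward function reproduces. A hedged sketch of that idea on plain vectors; `SplitMeta` is a hypothetical stand-in, not the generated ViewMeta subclass.

// Sketch: the same op description, replayed with a different out_index,
// reproduces a different output of the multi-output view op.
#include <cassert>
#include <cstdint>
#include <vector>

struct SplitMeta {
  int64_t split_size;
  int64_t out_index;  // which chunk of the split this meta reproduces

  std::vector<int> forward(const std::vector<int>& base) const {
    auto begin = base.begin() + out_index * split_size;
    return std::vector<int>(begin, begin + split_size);
  }
};

int main() {
  std::vector<int> base{0, 1, 2, 3, 4, 5};
  SplitMeta chunk0{/*split_size=*/2, /*out_index=*/0};
  SplitMeta chunk2 = chunk0;
  chunk2.out_index = 2;  // in spirit, meta->to_out_index(2)
  assert(chunk0.forward(base) == (std::vector<int>{0, 1}));
  assert(chunk2.forward(base) == (std::vector<int>{4, 5}));
}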
std::vector<Tensor> create_functional_tensor_with_view_meta( std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
ITensorListRef view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta) {
std::vector<Tensor> outputs(view_to_wrap.size()); std::vector<Tensor> outputs(view_to_wrap.size());
int64_t i = 0; int64_t i = 0;
for (const auto& tensor : view_to_wrap) { for (const auto& tensor : view_to_wrap) {
@ -787,22 +785,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(
return outputs; return outputs;
} }
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) { void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
self_impl->mutate_view_meta(meta); self_impl->mutate_view_meta(meta);
} }
Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
Tensor r = base;
for (auto& vm : sequence) {
r = vm->forward(r);
}
return r;
}
// Note [Propagating strides in the functionalization pass] // Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass // In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementation with meta tensors. // calls each {view} reference implementation with meta tensors.
@ -896,7 +884,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
const auto& ivalue = returns[idx]; const auto& ivalue = returns[idx];
if (ivalue.isTensor()) { if (ivalue.isTensor()) {
const auto& t = ivalue.toTensor(); const auto& t = ivalue.toTensor();
if (!t.defined()) { continue; } if (!t.defined()) continue;
at::functionalization::impl::sync(t); at::functionalization::impl::sync(t);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new; (*stack)[returns_begin + idx] = t_new;

View File

@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
explicit FunctionalTensorWrapper( explicit FunctionalTensorWrapper(
const Tensor& view_value, const Tensor& view_value,
const FunctionalTensorWrapper* base, const FunctionalTensorWrapper* base,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
// Get the underlying, actual tensor, that doesn't know anything about // Get the underlying, actual tensor, that doesn't know anything about
// functionalization. // functionalization.
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
->are_all_mutations_under_no_grad_or_inference_mode(); ->are_all_mutations_under_no_grad_or_inference_mode();
} }
void maybe_mark_symbolic(functionalization::ViewMeta* meta) { void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
} }
bool is_symbolic() const { bool is_symbolic() const {
return is_symbolic_; return is_symbolic_;
} }
// Retrieves the ViewMeta sequence of this tensor. // Runs the forward_fn of every ViewMeta collected in the current instance
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas() // to some other base.
const; Tensor apply_view_metas(const Tensor& base);
// Sync's the underlying tensor with its alias, if it's out of date. This // Sync's the underlying tensor with its alias, if it's out of date. This
// involves two steps: 1) Apply any pending updates/mutations to the alias 2) // involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// from the base tensor. This method is used by inplace-view ops like // from the base tensor. This method is used by inplace-view ops like
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
// tensor by replaying the views off of the alias. // tensor by replaying the views off of the alias.
void mutate_view_meta( void mutate_view_meta(const at::functionalization::ViewMeta& meta);
const std::shared_ptr<at::functionalization::ViewMeta>& meta);
// Custom implementation of self.set_(src) // Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other); void set__impl(const FunctionalTensorWrapper* other);
@ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
bool is_symbolic_ = false; bool is_symbolic_ = false;
size_t generation_ = 0; size_t generation_ = 0;
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_; std::vector<at::functionalization::ViewMeta> view_metas_;
protected: protected:
static void copy_tensor_metadata( static void copy_tensor_metadata(
@ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct(
Tensor create_functional_tensor_with_view_meta( Tensor create_functional_tensor_with_view_meta(
const Tensor& view_to_wrap, const Tensor& view_to_wrap,
const Tensor& base, const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta, functionalization::ViewMeta meta,
int64_t out_idx = 0); int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta( std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap, ITensorListRef view_to_wrap,
const Tensor& base, const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
void mutate_view_meta( void mutate_view_meta(
const Tensor& self, const Tensor& self,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
TORCH_API Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset( void set_sizes_strides_offset(

View File

@ -1,5 +1,3 @@
#include <ATen/FunctionalizeFallbackKernel.h>
#include <ATen/core/dispatch/Dispatcher.h> #include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h> #include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h> #include <ATen/EmptyTensor.h>
@ -9,6 +7,7 @@
#include <torch/library.h> #include <torch/library.h>
#include <c10/util/irange.h> #include <c10/util/irange.h>
#include <c10/util/strides.h> #include <c10/util/strides.h>
#include <ATen/EmptyTensor.h>
#ifndef AT_PER_OPERATOR_HEADERS #ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/ATen.h> #include <ATen/ATen.h>
@ -29,31 +28,6 @@
#include <utility> #include <utility>
#endif #endif
namespace at::functionalization {
Tensor resize__ViewMeta::forward(const Tensor& base) {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
}
Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return base.as_strided_scatter(
mutated_view, size, c10::contiguous_strides(size));
}
Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
return at::_unsafe_view_symint(base, size);
}
Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
}
} // namespace at::functionalization
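
The removed `resize__ViewMeta::forward`/`reverse` pair (reintroduced below as lambdas) illustrates the invariant every ViewMeta provides: `forward` produces the view from the base, and `reverse` scatters a possibly mutated view back into a full-sized base. A small round-trip sketch using the public libtorch ops, assuming libtorch is available; it is illustrative, not the functionalization kernel itself.

// forward: the "resized" tensor is an as_strided slice of the base storage.
// reverse: as_strided_scatter writes the mutated slice back into the base.
#include <torch/torch.h>
#include <iostream>
#include <vector>

int main() {
  auto base = torch::arange(12.0);                       // 12-element storage
  std::vector<int64_t> size{2, 3};                       // the "resized" shape
  std::vector<int64_t> stride{3, 1};                     // contiguous strides for {2, 3}
  auto view = base.as_strided(size, stride);             // forward: resize as a slice
  auto mutated = view + 100;                             // pretend the view was written to
  auto new_base = base.as_strided_scatter(mutated, size, stride);  // reverse
  std::cout << new_base << std::endl;                    // first 6 values +100, rest unchanged
}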
namespace { namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
const auto& schema = op.schema(); const auto& schema = op.schema();
@ -132,9 +106,7 @@ namespace {
const auto& ivalue = returns[idx]; const auto& ivalue = returns[idx];
if (ivalue.isTensor() && should_wrap_outputs) { if (ivalue.isTensor() && should_wrap_outputs) {
const auto& t = ivalue.toTensor(); const auto& t = ivalue.toTensor();
if (!t.defined()) { if (!t.defined()) continue;
continue;
}
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new; (*stack)[returns_begin + idx] = t_new;
} else if (ivalue.isTensorList() && should_wrap_outputs) { } else if (ivalue.isTensorList() && should_wrap_outputs) {
@ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
// The output of resizing is equivalent to taking a slice of a larger tensor. // The output of resizing is equivalent to taking a slice of a larger tensor.
// We have to emulate this "slicing" with an as_strided call. // We have to emulate this "slicing" with an as_strided call.
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>( at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
reapply_views, size.vec()); [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
},
/*has_symbolic_inputs=*/false
);
at::functionalization::impl::mutate_view_meta(self, view_meta); at::functionalization::impl::mutate_view_meta(self, view_meta);
return self; return self;
} }
@ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
tmp_output = at::_unsafe_view_symint(self_, size); tmp_output = at::_unsafe_view_symint(self_, size);
} }
bool has_symbolic_inputs = std::any_of( bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
auto view_meta = at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>( [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
has_symbolic_inputs, size.vec()); return at::_unsafe_view_symint(base, size);
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
},
/*has_symbolic_inputs=*/has_symbolic_inputs
);
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
// See Note [Propagating strides in the functionalization pass] // See Note [Propagating strides in the functionalization pass]

View File

@ -1,58 +0,0 @@
#pragma once
#include <ATen/FunctionalStorageImpl.h>
namespace at::functionalization {
// `ViewMeta` implementation for `resize_` operation.
struct TORCH_API resize__ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* reapply_views */,
const std::vector<int64_t>&);
resize__ViewMeta(const SerializableTuple& tpl)
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
: ViewMeta(/*has_symbolic_inputs=*/false),
reapply_views(reapply_views),
size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(reapply_views, size);
}
bool reapply_views;
std::vector<int64_t> size;
};
// `ViewMeta` implementation for `_unsafe_view` operation.
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* has_symbolic_inputs */,
const std::vector<c10::SymInt>&);
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
_unsafe_view_ViewMeta(
bool has_symbolic_inputs,
const std::vector<c10::SymInt>& size)
: ViewMeta(has_symbolic_inputs), size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(has_symbolic_inputs, size);
}
std::vector<c10::SymInt> size;
};
} // namespace at::functionalization

View File

@ -45,39 +45,7 @@ inline void infer_size_impl(
} }
} }
if (infer_dim) { auto set_infer_dim = [&]() {
// numel is the product of the known sizes; it has to be divisible by newsize,
// and newsize should be positive unless newsize == numel (we throw a
// different error message in that case).
if constexpr (std::is_same_v<NumelType, c10::SymInt>) {
auto v = newsize.maybe_as_int();
if (v and *v == 0) {
// Avoid div by 0 when sym_eq(numel % newsize, 0) is constructed!
// which may happen when newsize is not a symbol! if its a symbol
// division won't happen anyway during compile.
TORCH_MAYBE_SYM_CHECK(
numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
} else {
auto cond = sym_gt(newsize, 0)
.sym_and(sym_eq(numel % newsize, 0))
.sym_or(sym_eq(numel, newsize));
TORCH_MAYBE_SYM_CHECK(
cond, "shape '", shape, "' is invalid for input of size ", numel);
}
} else {
TORCH_CHECK(
(newsize > 0 && (numel % newsize == 0)) || numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
}
// We have a degree of freedom here to select the dimension size; follow // We have a degree of freedom here to select the dimension size; follow
// NumPy semantics and just bail. However, a nice error message is needed // NumPy semantics and just bail. However, a nice error message is needed
// because users often use `view` as a way to flatten & unflatten // because users often use `view` as a way to flatten & unflatten
@ -86,15 +54,19 @@ inline void infer_size_impl(
// works yet // works yet
// empty_tensor.view(-1, 0) // empty_tensor.view(-1, 0)
// doesn't. // doesn't.
TORCH_MAYBE_SYM_CHECK( TORCH_CHECK(
newsize != 0, newsize != 0,
"cannot reshape tensor of 0 elements into shape ", "cannot reshape tensor of 0 elements into shape ",
shape, shape,
" because the unspecified dimension size -1 can be any " " because the unspecified dimension size -1 can be any "
"value and is ambiguous"); "value and is ambiguous");
res[*infer_dim] = numel / newsize; res[*infer_dim] = numel / newsize;
return; return;
};
if (infer_dim && newsize > 0 && numel % newsize == 0) {
set_infer_dim();
return;
} }
TORCH_MAYBE_SYM_CHECK( TORCH_MAYBE_SYM_CHECK(
@ -103,6 +75,9 @@ inline void infer_size_impl(
shape, shape,
"' is invalid for input of size ", "' is invalid for input of size ",
numel); numel);
if (infer_dim) {
set_infer_dim();
}
} }
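
The reordering above does not change the inference rule itself: with a single -1 in the requested shape, the product of the known sizes must divide numel (and be nonzero), and the inferred dimension becomes numel / newsize. A standalone, integer-only sketch of that rule (no SymInt handling and no error-message parity with the real check):

#include <cstdint>
#include <iostream>
#include <optional>
#include <stdexcept>
#include <vector>

std::vector<int64_t> infer_size_sketch(std::vector<int64_t> shape, int64_t numel) {
  std::optional<size_t> infer_dim;
  int64_t newsize = 1;  // product of the explicitly given sizes
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      if (infer_dim) throw std::runtime_error("only one dimension can be inferred");
      infer_dim = i;
    } else {
      newsize *= shape[i];
    }
  }
  if (infer_dim) {
    if (newsize == 0 || numel % newsize != 0)
      throw std::runtime_error("shape is invalid for input size");
    shape[*infer_dim] = numel / newsize;  // the inferred dimension
  } else if (newsize != numel) {
    throw std::runtime_error("shape is invalid for input size");
  }
  return shape;
}

int main() {
  auto s = infer_size_sketch({2, -1, 3}, 24);
  std::cout << s[1] << std::endl;  // 4
}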
inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) { inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) {

View File

@ -1,22 +1,32 @@
#include <ATen/core/PythonOpRegistrationTrampoline.h> #include <ATen/core/PythonOpRegistrationTrampoline.h>
#include <c10/core/impl/PyInterpreterHooks.h>
// TODO: delete this
namespace at::impl { namespace at::impl {
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr; // The strategy is that all python interpreters attempt to register themselves
// as the main interpreter, but only one wins. Only that interpreter is
// allowed to interact with the C++ dispatcher. Furthermore, when we execute
// logic on that interpreter, we do so hermetically, never setting pyobj field
// on Tensor.
std::atomic<c10::impl::PyInterpreter*>
PythonOpRegistrationTrampoline::interpreter_{nullptr};
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() { c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
return c10::impl::getGlobalPyInterpreter(); return PythonOpRegistrationTrampoline::interpreter_.load();
} }
bool PythonOpRegistrationTrampoline::registerInterpreter( bool PythonOpRegistrationTrampoline::registerInterpreter(
c10::impl::PyInterpreter* interp) { c10::impl::PyInterpreter* interp) {
if (interpreter_ != nullptr) { c10::impl::PyInterpreter* expected = nullptr;
interpreter_.compare_exchange_strong(expected, interp);
if (expected != nullptr) {
// This is the second (or later) Python interpreter, which means we need
// non-trivial hermetic PyObject TLS
c10::impl::HermeticPyObjectTLS::init_state();
return false; return false;
} else {
return true;
} }
interpreter_ = interp;
return true;
} }
} // namespace at::impl } // namespace at::impl
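
The new implementation swaps the plain pointer for an atomic compare-and-swap, so that when several Python interpreters race to register, exactly one becomes the main interpreter. A minimal sketch of that pattern with a dummy `Interp` type (not the real `c10::impl::PyInterpreter`):

#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

struct Interp { int id; };

std::atomic<Interp*> g_main{nullptr};

// Returns true only for the first caller; everyone else loses the race.
bool register_interpreter(Interp* me) {
  Interp* expected = nullptr;
  return g_main.compare_exchange_strong(expected, me);
}

int main() {
  std::vector<Interp> interps{{0}, {1}, {2}, {3}};
  std::atomic<int> winners{0};
  std::vector<std::thread> threads;
  for (auto& interp : interps) {
    threads.emplace_back([&winners, p = &interp] {
      if (register_interpreter(p)) ++winners;
    });
  }
  for (auto& t : threads) t.join();
  std::cout << "winners=" << winners << " main id=" << g_main.load()->id << "\n";  // winners=1
}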

View File

@ -2,21 +2,19 @@
#include <ATen/core/dispatch/Dispatcher.h> #include <ATen/core/dispatch/Dispatcher.h>
// TODO: We can get rid of this // TODO: this can probably live in c10
namespace at::impl { namespace at::impl {
// Manages the single Python interpreter instance for PyTorch.
class TORCH_API PythonOpRegistrationTrampoline final { class TORCH_API PythonOpRegistrationTrampoline final {
static c10::impl::PyInterpreter* interpreter_; static std::atomic<c10::impl::PyInterpreter*> interpreter_;
public: public:
// Register the Python interpreter. Returns true on first registration, // Returns true if you successfully registered yourself (that means
// false if an interpreter was already registered. // you are in the hot seat for doing the operator registrations!)
static bool registerInterpreter(c10::impl::PyInterpreter*); static bool registerInterpreter(c10::impl::PyInterpreter*);
// Returns the registered interpreter via the global PyInterpreter hooks.
// Returns nullptr if no interpreter has been registered yet. // Returns nullptr if no interpreter has been registered yet.
static c10::impl::PyInterpreter* getInterpreter(); static c10::impl::PyInterpreter* getInterpreter();
}; };

View File

@ -149,105 +149,5 @@ static inline void pack_vnni4(
#endif #endif
} }
// This is a helper function for transpose_pack_vnni4
// Transform a [4, 16] block (with incontiguous output)
// Src:
// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
// b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16
// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16
// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16
// Dst:
// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4
// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8
// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12
// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void transpose_vnni4_pad_4x16_block(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t ld_dst,
int krem = 4) {
#if defined(CPU_CAPABILITY_AVX512)
__m128i r[4];
for (int i = 0; i < krem; ++i) {
r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * ld_src));
}
for (int i = krem; i < 4; ++i) {
r[i] = _mm_setzero_si128();
}
// Transpose 4x16 bytes using unpack and shuffle
__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]);
__m128i t1 = _mm_unpackhi_epi32(r[0], r[1]);
__m128i t2 = _mm_unpacklo_epi32(r[2], r[3]);
__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]);
__m128i r0 = _mm_unpacklo_epi64(t0, t2);
__m128i r1 = _mm_unpackhi_epi64(t0, t2);
__m128i r2 = _mm_unpacklo_epi64(t1, t3);
__m128i r3 = _mm_unpackhi_epi64(t1, t3);
// Store output
if (krem == 4) {
// normal case
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3);
} else {
// masked case
__mmask16 mask = (1ULL << (krem * 4)) - 1;
_mm_mask_storeu_epi8(dst, mask, r0);
_mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1);
_mm_mask_storeu_epi8(
reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2);
_mm_mask_storeu_epi8(
reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3);
}
#else
TORCH_CHECK(
false,
"transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported")
#endif
}
// Do the transpose packing fusion with VNNI4
// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8)
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void transpose_pack_vnni4(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t K,
int64_t N) {
#if defined(CPU_CAPABILITY_AVX512)
TORCH_CHECK(
N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4");
int64_t bk = 0;
int64_t _K = K / 4 * 4;
for (; bk < _K; bk += 4) {
int64_t bn = 0;
for (; bn < N; bn += 16) {
transpose_vnni4_pad_4x16_block(
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4);
}
}
// Handle leftover K rows (< 4)
if (K % 4 != 0) {
int krem = K - bk;
int64_t bn = 0;
for (; bn < N; bn += 16) {
transpose_vnni4_pad_4x16_block(
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem);
}
}
#else
TORCH_CHECK(
false, "transpose_pack_vnni4 is only supported when AVX-512 is supported")
#endif
}
} // namespace CPU_CAPABILITY } // namespace CPU_CAPABILITY
} // namespace at::vec } // namespace at::vec
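
The removed kernel implements the [K, N] → [N/4, K, 4] reorder drawn in the comment with AVX-512, 4x16 bytes at a time, zero-padding the K remainder. Below is a scalar reference of the same indexing (no intrinsics, no padding), useful only to pin down where each element lands; `transpose_pack_vnni4_ref` is a hypothetical name for this sketch.

// Scalar reference: dst viewed as [N/4, K, 4] satisfies dst[n/4][k][n%4] = src[k][n].
#include <cassert>
#include <cstdint>
#include <vector>

void transpose_pack_vnni4_ref(const int8_t* src, int8_t* dst,
                              int64_t ld_src, int64_t K, int64_t N) {
  assert(N % 4 == 0);
  for (int64_t n = 0; n < N; ++n)
    for (int64_t k = 0; k < K; ++k)
      dst[(n / 4) * K * 4 + k * 4 + (n % 4)] = src[k * ld_src + n];
}

int main() {
  const int64_t K = 3, N = 8;
  std::vector<int8_t> src(K * N), dst(K * N, 0);
  for (int64_t i = 0; i < K * N; ++i) src[i] = static_cast<int8_t>(i);
  transpose_pack_vnni4_ref(src.data(), dst.data(), /*ld_src=*/N, K, N);
  // src[k=1][n=5] (= 13) must land at dst[n/4=1][k=1][n%4=1].
  assert(dst[1 * K * 4 + 1 * 4 + 1] == 13);
  return 0;
}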

View File

@ -1637,7 +1637,9 @@ bool gemm_and_bias(
if (activation == GEMMAndBiasActivationEpilogue::RELU) { if (activation == GEMMAndBiasActivationEpilogue::RELU) {
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) { } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
#endif
} }
if (bias != nullptr) { if (bias != nullptr) {
@ -1929,6 +1931,7 @@ void scaled_gemm(
bool use_fast_accum) { bool use_fast_accum) {
// Note: see `cublasCommonArgs` for various non-intuitive manipulations // Note: see `cublasCommonArgs` for various non-intuitive manipulations
// of input arguments to this function. // of input arguments to this function.
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F; const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F; const auto scaleType = CUDA_R_32F;
const float alpha_val = 1.0; const float alpha_val = 1.0;
@ -2130,6 +2133,8 @@ void scaled_gemm(
" scaleType ", " scaleType ",
scaleType); scaleType);
return; return;
#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
} }
void int8_gemm( void int8_gemm(

View File

@ -281,9 +281,6 @@ bool CUDAHooks::compiledWithMIOpen() const {
bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED() #if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
// NOTE: extra parenthesis around numbers disable clang warnings about // NOTE: extra parenthesis around numbers disable clang warnings about
// dead code // dead code
return true; return true;
@ -294,9 +291,6 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED() #if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Check for Volta cores // Check for Volta cores
if (prop->major >= 7) { if (prop->major >= 7) {
@ -311,9 +305,6 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
#if AT_CUDNN_ENABLED() #if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Check for Volta cores // Check for Volta cores
if (prop->major >= 8) { if (prop->major >= 8) {

View File

@ -465,11 +465,8 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor
return false; return false;
} }
auto is_channel_last = [](const at::Tensor& t) { auto fmt = input.suggest_memory_format();
auto fmt = t.suggest_memory_format(); return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
};
return is_channel_last(input) || is_channel_last(weight);
} }
} // namespace at::native } // namespace at::native
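
The rewritten helper now probes the suggested memory format of both input and weight instead of input alone. A tiny libtorch sketch (assuming libtorch is available) of what that probe reports for contiguous versus channels-last tensors:

#include <torch/torch.h>
#include <iostream>

int main() {
  auto nchw = torch::randn({8, 3, 32, 32});                          // default contiguous layout
  auto nhwc = nchw.contiguous(at::MemoryFormat::ChannelsLast);       // NHWC strides
  std::cout << (nchw.suggest_memory_format() == at::MemoryFormat::ChannelsLast) << "\n";  // 0
  std::cout << (nhwc.suggest_memory_format() == at::MemoryFormat::ChannelsLast) << "\n";  // 1
}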

View File

@ -32,6 +32,10 @@
#include <ATen/native/mkldnn/Utils.h> #include <ATen/native/mkldnn/Utils.h>
#endif #endif
#ifdef USE_MPS
#include <ATen/mps/MPSDevice.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS #ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h> #include <ATen/Functions.h>
#include <ATen/NativeFunctions.h> #include <ATen/NativeFunctions.h>
@ -406,23 +410,11 @@ struct ConvParams {
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE) #if !defined(C10_MOBILE)
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false; return false;
} }
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
// broken on cuDNN 9.8
if (cudnn_version >= 90800) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
weight.dim() == 5) {
for (int i = 2; i < weight.dim(); i++) {
if (weight.size(i) != 1) {
return false;
}
}
}
}
if (needs_64bit_indexing_no_split(input, weight)) { if (needs_64bit_indexing_no_split(input, weight)) {
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+." " if the V8 API is not enabled or before cuDNN version 9.3+."
@ -430,6 +422,9 @@ struct ConvParams {
return false; return false;
} }
} }
if (!input.is_cuda() || !cudnn_enabled) {
return false;
}
if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
return false; return false;
@ -448,19 +443,16 @@ struct ConvParams {
// Use cudnn for FP16 depthwise convolutions // Use cudnn for FP16 depthwise convolutions
bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false; return false;
} }
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
// always use cudnn_depthwise for channels_last format
return true;
}
// native kernel doesn't support 64-bit non-splittable case // native kernel doesn't support 64-bit non-splittable case
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
if (cudnn_version < 0 || cudnn_version > 91000) {
return false;
}
}
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+." " if the V8 API is not enabled or before cuDNN version 9.3+."
@ -470,10 +462,6 @@ struct ConvParams {
return true; return true;
} }
} }
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
// always use cudnn_depthwise for channels_last format
return true;
}
if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
bool kernel_cond = (use_cudnn(input, weight) && bool kernel_cond = (use_cudnn(input, weight) &&
input.scalar_type() == kHalf && // only for FP16 input.scalar_type() == kHalf && // only for FP16
@ -1441,8 +1429,12 @@ static inline at::MemoryFormat determine_backend_memory_format(
} }
break; break;
case ConvBackend::Mps: case ConvBackend::Mps:
case ConvBackend::MpsTranspose:
if (mps_conv_use_channels_last(input, weight)) { if (mps_conv_use_channels_last(input, weight)) {
#ifdef USE_MPS
if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) {
break;
}
#endif
backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast;
} }
break; break;

View File

@ -9,7 +9,6 @@
#include <ATen/native/TransposeType.h> #include <ATen/native/TransposeType.h>
#include <ATen/native/Unfold3d.h> #include <ATen/native/Unfold3d.h>
#include <c10/util/irange.h> #include <c10/util/irange.h>
#include <c10/util/safe_numerics.h>
#ifndef AT_PER_OPERATOR_HEADERS #ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h> #include <ATen/Functions.h>
@ -175,23 +174,6 @@ static inline void slow_conv3d_shape_check(
const int64_t input_height = input.size(dim_height); const int64_t input_height = input.size(dim_height);
const int64_t input_width = input.size(dim_width); const int64_t input_width = input.size(dim_width);
constexpr int64_t MAX_SAFE_PAD = (1LL << 61);
TORCH_CHECK_VALUE(
pad_height <= MAX_SAFE_PAD,
"Padding height too large: pad_height=",
pad_height);
TORCH_CHECK_VALUE(
pad_width <= MAX_SAFE_PAD,
"Padding width too large: pad_width=",
pad_width);
TORCH_CHECK_VALUE(
pad_depth <= MAX_SAFE_PAD,
"Padding depth too large: pad_depth=",
pad_depth);
const int64_t exact_input_depth = input_depth + 2 * pad_depth; const int64_t exact_input_depth = input_depth + 2 * pad_depth;
const int64_t exact_input_height = input_height + 2 * pad_height; const int64_t exact_input_height = input_height + 2 * pad_height;
const int64_t exact_input_width = input_width + 2 * pad_width; const int64_t exact_input_width = input_width + 2 * pad_width;
@ -239,14 +221,6 @@ static inline void slow_conv3d_shape_check(
output_width, output_width,
"). Output size is too small"); "). Output size is too small");
uint64_t kernel_product;
TORCH_CHECK(
!c10::mul_overflows(kernel_height, kernel_width, &kernel_product),
"Kernel height x width product is too large: kernel_height=",
kernel_height,
", kernel_width=",
kernel_width);
if (weight.defined()) { if (weight.defined()) {
int64_t n_input_plane = weight.size(1); int64_t n_input_plane = weight.size(1);
if (weight.dim() == 2) { if (weight.dim() == 2) {

View File

@ -97,38 +97,43 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
int64_t nDims = self.dim(); int64_t nDims = self.dim();
TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");
auto height = self.sym_size(0); int64_t height = self.size(0);
auto width = self.sym_size(1); int64_t width = self.size(1);
if (nDims > 2) { if (nDims > 2) {
int64_t dim1 = height;
for (const auto i : c10::irange(1, nDims)) { for (const auto i : c10::irange(1, nDims)) {
if (self.sym_size(i) != height) { if (self.size(i) != dim1) {
TORCH_CHECK(false, "all dimensions of input must be of equal length"); TORCH_CHECK(false, "all dimensions of input must be of equal length");
} }
} }
} }
auto storage_offset = self.sym_storage_offset(); int64_t storage_offset = self.storage_offset();
auto size = std::min(height, width); std::vector<int64_t> sizes;
std::vector<int64_t> strides;
int64_t size = std::min(height, width);
int64_t stride = 0; int64_t stride = 0;
for (const auto i : c10::irange(nDims)) { for (const auto i : c10::irange(nDims)) {
stride += self.stride(i); stride += self.stride(i);
} }
std::vector<SymInt> strides{stride}; strides.push_back(stride);
std::vector<SymInt> sizes{size}; sizes.push_back(size);
auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); auto main_diag = self.as_strided(sizes, strides, storage_offset);
main_diag.fill_(fill_value); main_diag.fill_(fill_value);
if (wrap && nDims == 2 && height > width + 1) { if (wrap && nDims == 2 && height > width + 1) {
auto step = width + 1; std::vector<int64_t> wrap_sizes;
auto wrap_size = ((self.numel() + step - 1) / step) - size;
std::vector<SymInt> wrap_sizes{wrap_size};
auto offset = self.stride(0) * (width + 1); int64_t step = width + 1;
int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
wrap_sizes.push_back(wrap_size);
auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); int64_t offset = self.stride(0) * (width + 1);
auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
wrap_diag.fill_(fill_value); wrap_diag.fill_(fill_value);
} }
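
The rewrite keeps the underlying trick: summing the strides of all dimensions yields a single step that walks the main diagonal, so a 1-D as_strided view of length min(height, width) covers exactly the diagonal and can be filled in place. A small libtorch sketch of that view (public as_strided, not the SymInt-aware internals):

#include <torch/torch.h>
#include <algorithm>
#include <iostream>

int main() {
  auto t = torch::zeros({4, 4});
  int64_t stride = 0;
  for (int64_t i = 0; i < t.dim(); ++i) stride += t.stride(i);       // 4 + 1 = 5
  int64_t size = std::min(t.size(0), t.size(1));
  // A length-4 view stepping by 5 elements lands on t[0][0], t[1][1], ...
  t.as_strided({size}, {stride}, t.storage_offset()).fill_(7);
  std::cout << t << std::endl;  // 7s on the main diagonal, zeros elsewhere
}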

View File

@ -23,7 +23,6 @@
#include <ATen/ops/linspace.h> #include <ATen/ops/linspace.h>
#endif #endif
#include <cmath>
#include <numeric> #include <numeric>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
@ -203,46 +202,6 @@ select_outer_bin_edges(const Tensor& input, std::optional<c10::ArrayRef<double>>
return std::make_pair(leftmost_edges, rightmost_edges); return std::make_pair(leftmost_edges, rightmost_edges);
} }
/* Bin edge correction based on the precision of the representation.
* To maintain backward compatibility we take max(std::nextafter<>, +1)
* and min(std::nextafter<>, -1) for floating-point scalar types; for other types, +/- 1 as usual.
*/
void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &rightmost_edge)
{
#define UPDATE_WITH_LIMIT(real_type, scalartype) \
case ScalarType::scalartype: \
leftmost_edge = std::min( \
static_cast<double>( \
std::nexttoward( \
static_cast<real_type>(leftmost_edge), \
std::numeric_limits<real_type>::lowest() \
) \
), \
leftmost_edge - 1. \
); \
rightmost_edge = std::max( \
static_cast<double>( \
std::nexttoward( \
static_cast<real_type>(rightmost_edge), \
std::numeric_limits<real_type>::max() \
) \
), \
rightmost_edge + 1. \
); \
break;
switch (t) {
UPDATE_WITH_LIMIT(double, Double)
UPDATE_WITH_LIMIT(float, Float)
default:
// Fallback to the default behavior for other types
leftmost_edge -= 1;
rightmost_edge += 1;
}
#undef UPDATE_WITH_LIMIT
}
/* histc's version of the logic for outermost bin edges. /* histc's version of the logic for outermost bin edges.
*/ */
std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input, std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
@ -257,7 +216,8 @@ std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
} }
if (leftmost_edge == rightmost_edge) { if (leftmost_edge == rightmost_edge) {
bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge); leftmost_edge -= 1;
rightmost_edge += 1;
} }
TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) ||

View File

@ -23,6 +23,8 @@ Tensor& max_unpooling2d_forward_out_cpu(
// Nondeterministic with duplicate indices // Nondeterministic with duplicate indices
at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK( TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long, indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type()); "elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -43,9 +45,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
self_.sizes(), " with dimension ", i , " being empty."); self_.sizes(), " with dimension ", i , " being empty.");
} }
auto oheight = output_size[0];
auto owidth = output_size[1];
auto memory_format = self_.suggest_memory_format(); auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format); auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format);

View File

@ -1,5 +1,3 @@
#include <ATen/core/ATen_fwd.h>
#include <c10/core/ScalarType.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/AccumulateType.h> #include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h> #include <ATen/Dispatch.h>
@ -1880,18 +1878,19 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
Tensor xtensor = self.expand(padded_size); Tensor xtensor = self.expand(padded_size);
Tensor urtensor; Tensor result;
if (self.is_quantized()) { if (self.is_quantized()) {
urtensor = at::empty_quantized(target_size, self); result = at::empty_quantized(target_size, self);
} else { } else {
urtensor = at::empty(target_size, self.options()); result = at::empty(target_size, self.options());
} }
// return an empty tensor if one of the repeat dimensions is zero // return an empty tensor if one of the repeat dimensions is zero
if (zero_tensor) { if (zero_tensor) {
return urtensor; return result;
} }
Tensor urtensor = at::alias(result);
for (const auto i : c10::irange(xtensor.dim())) { for (const auto i : c10::irange(xtensor.dim())) {
// can't unfold with step 0, so make sure step is at least 1 // can't unfold with step 0, so make sure step is at least 1
// (it doesn't matter what it is in that case, because the size is 0). // (it doesn't matter what it is in that case, because the size is 0).
@ -1901,22 +1900,7 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
urtensor.copy_(xtensor.expand_as(urtensor)); urtensor.copy_(xtensor.expand_as(urtensor));
// Combine the dimensions to produce the target_size. return result;
// xtensor dims: [a0, ..., ad-1]
// urtensor dims: [a0, ..., ad-1, b0, ..., bd-1]
// b dims are produced by unfold.
// Transform urtensor to [a0 * b0, ..., ad-1 * bd-1]
const int64_t n_dims = xtensor.dim();
auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong));
auto range_b = range_a + n_dims;
auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten();
auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2);
// Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1]
urtensor = urtensor.permute(permutation);
// Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1]
urtensor = urtensor.reshape(target_size);
return urtensor;
} }
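
The removed comment describes how repeat's output dimensions are combined: repeat counts and original sizes live in separate dimensions that get interleaved and then merged pairwise. The sketch below reproduces that bookkeeping on a concrete 2-D case with expand/permute/reshape rather than the unfold-based buffer view used here, and checks it against Tensor::repeat (assuming libtorch is available):

// Interleave repeat dims with original dims, then merge pairs to get the tiled shape.
#include <torch/torch.h>
#include <iostream>

int main() {
  auto x = torch::arange(6).reshape({2, 3});       // sizes a0=2, a1=3
  const int64_t b0 = 2, b1 = 2;                    // repeat counts
  auto u = x.unsqueeze(-1).unsqueeze(-1)           // [a0, a1, 1, 1]
            .expand({2, 3, b0, b1});               // [a0, a1, b0, b1]
  auto repeated = u.permute({2, 0, 3, 1})          // [b0, a0, b1, a1]
                   .reshape({2 * b0, 3 * b1});     // [b0*a0, b1*a1]
  std::cout << repeated << std::endl;
  std::cout << x.repeat({b0, b1}) << std::endl;    // same values
}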
Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {

View File

@ -999,41 +999,12 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
dtypes[i] = iter.dtype(i); dtypes[i] = iter.dtype(i);
} }
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter); auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}
});
#else
launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
auto offsets = offset_calc.get(idx); auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0]; void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result); c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}); });
#endif
} }
} }
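
The removed ROCm branch manually unrolls the elementwise loop by four (group size 128), computing four offset sets and four results per step before falling back to the one-element path. A plain-CPU sketch of the same unroll-by-4 shape with a scalar tail, illustrative only:

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

template <typename F>
void apply_unrolled4(std::vector<float>& out, const std::vector<float>& in, F f) {
  const size_t n = in.size();
  size_t i = 0;
  for (; i + 4 <= n; i += 4) {   // unrolled body: four independent evaluations
    out[i + 0] = f(in[i + 0]);
    out[i + 1] = f(in[i + 1]);
    out[i + 2] = f(in[i + 2]);
    out[i + 3] = f(in[i + 3]);
  }
  for (; i < n; ++i) {           // scalar tail for the remainder
    out[i] = f(in[i]);
  }
}

int main() {
  std::vector<float> in(10), out(10);
  std::iota(in.begin(), in.end(), 0.f);
  apply_unrolled4(out, in, [](float x) { return 2.f * x; });
  assert(out[9] == 18.f);
}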

View File

@ -42,19 +42,6 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) {
}); });
} }
#ifdef USE_ROCM
void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) {
return static_cast<float>(value);
});
}
void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) {
return static_cast<float>(value);
});
}
#endif
void float8_copy_kernel_cuda(TensorIteratorBase &iter) { void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
ScalarType dtype = iter.dtype(0); ScalarType dtype = iter.dtype(0);
ScalarType other_dtype = iter.dtype(1); ScalarType other_dtype = iter.dtype(1);
@ -200,17 +187,7 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
} else { } else {
float16_copy_kernel_cuda(iter); float16_copy_kernel_cuda(iter);
} }
} } else if (isBitsType(dtype)) {
#ifdef USE_ROCM
else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) {
if (iter.dtype(1) == kBFloat16) {
bfloat16tofloat32_copy_kernel_cuda(iter);
} else {
float16tofloat32_copy_kernel_cuda(iter);
}
}
#endif
else if (isBitsType(dtype)) {
TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting "
"bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype);
AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] {

View File

@ -125,6 +125,8 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
TORCH_CHECK( TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long, indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type()); "elements in indices should be type int64 but got: ", indices_.scalar_type());
auto oheight = output_size[0];
auto owidth = output_size[1];
TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2}, TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
indices_arg{indices_, "indices_", 3}; indices_arg{indices_, "indices_", 3};
@ -147,9 +149,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
output_size.size() == 2, output_size.size() == 2,
"There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements."); "There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");
auto oheight = output_size[0];
auto owidth = output_size[1];
int64_t dimw = 2; int64_t dimw = 2;
int64_t dimh = 1; int64_t dimh = 1;
int64_t numBatch = 1; int64_t numBatch = 1;
@ -218,6 +217,9 @@ static void max_unpooling3d_shape_check(
IntArrayRef stride, IntArrayRef stride,
IntArrayRef padding, IntArrayRef padding,
const char *fn_name) { const char *fn_name) {
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
TORCH_CHECK( TORCH_CHECK(
indices.scalar_type() == at::ScalarType::Long, indices.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices.scalar_type()); "elements in indices should be type int64 but got: ", indices.scalar_type());
@ -248,10 +250,6 @@ static void max_unpooling3d_shape_check(
"strides should be greater than zero, but got stride: ", "strides should be greater than zero, but got stride: ",
stride); stride);
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int dimw = 3; int dimw = 3;
int dimh = 2; int dimh = 2;
int dimt = 1; int dimt = 1;
@ -404,6 +402,8 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
const Tensor& indices_, const Tensor& indices_,
IntArrayRef output_size, IntArrayRef output_size,
Tensor& grad_input) { Tensor& grad_input) {
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
TORCH_CHECK( TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long, indices_.scalar_type() == at::ScalarType::Long,
@ -426,9 +426,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size()); TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int64_t nInputCols, nInputRows, nInputPlane; int64_t nInputCols, nInputRows, nInputPlane;
int dimw = 2; int dimw = 2;
@ -508,14 +505,13 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
IntArrayRef padding, IntArrayRef padding,
Tensor& grad_input) { Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int64_t oT = output_size[0]; int64_t oT = output_size[0];
int64_t oH = output_size[1]; int64_t oH = output_size[1];
int64_t oW = output_size[2]; int64_t oW = output_size[2];
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int batchSize = 0; int batchSize = 0;
int inputSlices = 0; int inputSlices = 0;
int inputTime = 0; int inputTime = 0;

View File

@ -300,6 +300,8 @@ void nonzero_static_cuda_out_impl(
int64_t size, int64_t size,
int64_t fill_value, int64_t fill_value,
Tensor& out) { Tensor& out) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
Tensor self_contiguous_ = self.contiguous(); Tensor self_contiguous_ = self.contiguous();
// see comment in nonzero_cuda_out_impl on reqs for out // see comment in nonzero_cuda_out_impl on reqs for out
bool out_correct_size = bool out_correct_size =
@ -375,6 +377,9 @@ void nonzero_static_cuda_out_impl(
if (need_to_copy) { if (need_to_copy) {
out.copy_(out_temp); out.copy_(out_temp);
} }
#else
TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
#endif
} }
Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) { Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {

View File

@ -221,9 +221,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
std::optional<CuFFTConfig> uncached_plan; std::optional<CuFFTConfig> uncached_plan;
const CuFFTConfig * config = nullptr; const CuFFTConfig * config = nullptr;
// Workaround for gh-63152, gh-58724
// Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
// Bluestein's algorithm is only used when a size has large prime factors, so // Bluestein's algorithm is only used when a size has large prime factors, so
// sizes with only small prime factors can still be cached. // sizes with only small prime factors can still be cached.
if (plan_cache.max_size() > 0) { bool use_caching = true;
#ifdef CUFFT_VERSION
if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
// Only cache plans for transforms with small prime factors
use_caching = std::none_of(
signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
return has_large_prime_factor(dim_size);
});
}
#endif
if (use_caching && plan_cache.max_size() > 0) {
guard.lock(); guard.lock();
if (plan_cache.max_size() > 0) { // check again after acquiring the lock if (plan_cache.max_size() > 0) { // check again after acquiring the lock
config = &plan_cache.lookup(Params); config = &plan_cache.lookup(Params);
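
The restored workaround only caches a plan when every transformed size is free of large prime factors, since cuFFT 10.3 Bluestein plans cannot be reused. Below is a sketch of the kind of test `has_large_prime_factor` performs (the real helper is defined elsewhere in this file and may differ): cuFFT has fast radix kernels for 2, 3, 5 and 7, so any other surviving factor forces Bluestein.

#include <cstdint>
#include <iostream>

bool has_large_prime_factor_sketch(int64_t n) {
  for (int64_t radix : {2, 3, 5, 7}) {
    while (n % radix == 0) n /= radix;  // strip the fast radices
  }
  return n != 1;  // something other than 2/3/5/7 survived
}

int main() {
  std::cout << has_large_prime_factor_sketch(1024) << "\n";    // 0: pure radix-2
  std::cout << has_large_prime_factor_sketch(3 * 11) << "\n";  // 1: factor 11 needs Bluestein
}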

View File

@ -1238,7 +1238,7 @@ Tensor _cholesky_solve_helper_cuda_magma(const Tensor& self, const Tensor& A, bo
// Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance. // Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance.
// Batched cholesky_solve is dispatched to magma. // Batched cholesky_solve is dispatched to magma.
Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) { Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
auto preferred_backend = at::globalContext().linalgPreferredBackend(); auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) { switch (preferred_backend) {
case at::LinalgBackend::Cusolver: case at::LinalgBackend::Cusolver:
@ -1352,7 +1352,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
} }
static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) { static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
auto preferred_backend = at::globalContext().linalgPreferredBackend(); auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) { switch (preferred_backend) {
case at::LinalgBackend::Cusolver: case at::LinalgBackend::Cusolver:
@ -2709,7 +2709,7 @@ void linalg_lstsq_gels(const Tensor& A, const Tensor& B, const Tensor& /*infos*/
} }
void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) { void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
auto preferred_backend = at::globalContext().linalgPreferredBackend(); auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) { switch (preferred_backend) {
case at::LinalgBackend::Magma: case at::LinalgBackend::Magma:
@ -2733,7 +2733,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul
// first handle the underdetermined case (m < n) // first handle the underdetermined case (m < n)
// this case is not supported by MAGMA or cuBLAS // this case is not supported by MAGMA or cuBLAS
if (m < n) { if (m < n) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
linalg_lstsq_gels(a, b, infos); linalg_lstsq_gels(a, b, infos);
#else #else
TORCH_CHECK( TORCH_CHECK(

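The hunks above share one pattern: a preprocessor guard decides whether the cuSOLVER path is compiled at all (the two sides of the diff differ only on whether ROCm builds are excluded), and a runtime switch on the preferred linear-algebra backend picks between cuSOLVER and MAGMA. A stripped-down sketch of that layered dispatch, with hypothetical solve_cusolver/solve_magma stand-ins for the real kernels:

#include <cstdio>

enum class LinalgBackend { Default, Cusolver, Magma };

static void solve_magma()    { std::puts("magma path"); }
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
static void solve_cusolver() { std::puts("cusolver path"); }
#endif

// Compile-time guard: without USE_LINALG_SOLVER (or, on one side of the diff,
// on ROCm builds) only the MAGMA path exists. Runtime switch: the preferred
// backend wins, otherwise a default is chosen.
void solve(LinalgBackend preferred) {
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
  switch (preferred) {
    case LinalgBackend::Magma:
      solve_magma();
      break;
    case LinalgBackend::Cusolver:
    default:
      solve_cusolver();
  }
#else
  (void)preferred;
  solve_magma();
#endif
}
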
View File

@ -14,7 +14,7 @@ struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> output_strides; ::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides; ::c10::metal::array<idx_type_t, 2> max_indices_strides;
idx_type_t per_sample_weights_stride; idx_type_t per_sample_weights_strides;
idx_type_t num_indices; idx_type_t num_indices;
idx_type_t num_bags; idx_type_t num_bags;

View File

@ -23,72 +23,54 @@ struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
template <EmbeddingBagMode M, typename T> template <EmbeddingBagMode M, typename T>
struct ReductionOp { struct ReductionOp {
inline opmath_t<T> operator()( inline opmath_t<T> operator()(
opmath_t<T> weight_val, T weight_val,
opmath_t<T> out_val, opmath_t<T> out_val,
bool is_first) { uint32_t per_sample_weights_index,
return weight_val + out_val; constant T* per_sample_weights,
uint32_t per_sample_weights_strides);
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides) {
if (per_sample_weights_strides) {
T per_sample_weight = per_sample_weights
[per_sample_weights_strides * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) *
static_cast<opmath_t<T>>(weight_val) +
out_val;
} else {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return static_cast<opmath_t<T>>(weight_val) + out_val;
} }
}; };
template <typename T> template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> { struct ReductionOp<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()( inline opmath_t<T> operator()(
opmath_t<T> weight_val, T weight_val,
opmath_t<T> out_val, opmath_t<T> out_val,
bool is_first) { uint32_t,
return (is_first || weight_val > out_val) ? weight_val : out_val; constant T*,
} uint32_t) {
}; return max(static_cast<opmath_t<T>>(weight_val), out_val);
template <EmbeddingBagMode M, typename T>
struct MaybeApplyPerSampleWeight {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
return weight_val;
}
};
template <typename T>
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
if (per_sample_weights_stride) {
T per_sample_weight = per_sample_weights
[per_sample_weights_stride * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
} else {
return weight_val;
}
}
};
template <EmbeddingBagMode M, typename T, typename I>
struct MaybeCalcMaxIndex {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {}
};
template <typename T, typename I>
struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {
max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx;
} }
}; };
@ -114,30 +96,6 @@ struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
} }
}; };
template <EmbeddingBagMode M, typename I>
struct MaybeWriteMaxIndex {
inline void operator()(
device I*,
const constant ::c10::metal::array<uint32_t, 2>&,
uint32_t,
uint32_t,
I) {}
};
template <typename I>
struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> {
inline void operator()(
device I* max_indices,
const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides,
uint32_t bag_idx,
uint32_t feature_idx,
I max_idx) {
max_indices
[bag_idx * max_indices_strides[0] +
feature_idx * max_indices_strides[1]] = max_idx;
}
};
template <EmbeddingBagMode M, typename T, typename I> template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl( void embedding_bag_impl(
constant T* weight, constant T* weight,
@ -154,7 +112,7 @@ void embedding_bag_impl(
auto num_bags = params.num_bags; auto num_bags = params.num_bags;
auto feature_size = params.feature_size; auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx; auto padding_idx = params.padding_idx;
auto per_sample_weights_stride = params.per_sample_weights_stride; auto per_sample_weights_strides = params.per_sample_weights_strides;
constant auto& output_strides = params.output_strides; constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides; constant auto& weight_strides = params.weight_strides;
constant auto& max_indices_strides = params.max_indices_strides; constant auto& max_indices_strides = params.max_indices_strides;
@ -162,6 +120,8 @@ void embedding_bag_impl(
auto bag_idx = tid / feature_size; auto bag_idx = tid / feature_size;
auto feature_idx = tid % feature_size; auto feature_idx = tid % feature_size;
output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1); uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
bool is_last_bag = bag_idx + 1 == num_bags; bool is_last_bag = bag_idx + 1 == num_bags;
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]); uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
@ -171,37 +131,28 @@ void embedding_bag_impl(
auto out_val = ReductionOpInit<M, T>()(); auto out_val = ReductionOpInit<M, T>()();
uint32_t bag_size_ = 0; uint32_t bag_size_ = 0;
I max_idx = 0;
for (uint32_t indices_idx = indices_start; indices_idx < indices_end; for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
indices_idx++) { indices_idx++) {
I weight_idx = indices[indices_idx]; I weight_idx = indices[indices_idx];
bool pad = (weight_idx == padding_idx); bool pad = (weight_idx == padding_idx);
auto weight_val = static_cast<opmath_t<T>>( T weight_val = weight
weight [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
[static_cast<uint32_t>(weight_idx) * weight_strides[0] + feature_idx * weight_strides[1]];
feature_idx * weight_strides[1]]);
weight_val = MaybeApplyPerSampleWeight<M, T>()(
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
MaybeCalcMaxIndex<M, T, I>()(
weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad);
out_val = pad ? out_val : new_out_val;
offset2bag[indices_idx] = bag_idx;
bag_size_ += static_cast<uint32_t>(!pad); bag_size_ += static_cast<uint32_t>(!pad);
auto tmp_val = ReductionOp<M, T>()(
weight_val,
out_val,
indices_idx,
per_sample_weights,
per_sample_weights_strides);
out_val = pad ? out_val : tmp_val;
} }
output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] = *output = ReductionOpFinal<M, T>()(out_val, bag_size_);
ReductionOpFinal<M, T>()(out_val, bag_size_);
bag_size[bag_idx] = bag_size_;
MaybeWriteMaxIndex<M, I>()(
max_indices, max_indices_strides, bag_idx, feature_idx, max_idx);
} }
#define DISPATCH_IMPL(MODE) \ #define DISPATCH_IMPL(MODE) \

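The refactor above folds three concerns into the per-bag loop: the reduction itself (sum / mean / max), per-sample weights that only apply in SUM mode, and max-index bookkeeping that only applies in MAX mode. A standalone CPU sketch of that loop for a single (bag, feature) pair, assuming row-major weight storage and float data; the names here are illustrative, not the kernel's:

#include <cstdint>
#include <limits>
#include <vector>

enum class Mode { Sum, Mean, Max };

struct BagResult {
  float value = 0.0f;
  int64_t max_index = 0;   // only meaningful in Max mode
  uint32_t bag_size = 0;   // number of non-padding indices in the bag
};

BagResult reduce_bag(const std::vector<float>& weight,   // [num_embeddings, feature_size], row-major
                     int64_t feature_size,
                     const std::vector<int64_t>& indices, // indices belonging to this bag
                     const std::vector<float>& per_sample_weights, // empty unless Mode::Sum
                     int64_t feature_idx,
                     int64_t padding_idx,
                     Mode mode) {
  BagResult r;
  float acc = (mode == Mode::Max) ? -std::numeric_limits<float>::infinity() : 0.0f;
  for (size_t i = 0; i < indices.size(); ++i) {
    const int64_t weight_idx = indices[i];
    if (weight_idx == padding_idx) {
      continue;                                   // padding rows contribute nothing
    }
    float val = weight[weight_idx * feature_size + feature_idx];
    if (mode == Mode::Sum && !per_sample_weights.empty()) {
      val *= per_sample_weights[i];               // per-sample weights only exist in Sum mode
    }
    if (mode == Mode::Max) {
      if (r.bag_size == 0 || val > acc) {         // first non-padding element or new maximum
        acc = val;
        r.max_index = weight_idx;
      }
    } else {
      acc += val;
    }
    ++r.bag_size;
  }
  if (mode == Mode::Mean && r.bag_size > 0) {
    acc /= static_cast<float>(r.bag_size);
  }
  r.value = (mode == Mode::Max && r.bag_size == 0) ? 0.0f : acc;
  return r;
}
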
View File

@ -223,6 +223,9 @@ void grid_sampler_single_element(
auto input_size = input_sizes[input_dim]; auto input_size = input_sizes[input_dim];
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]); auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
// Interpret nan as -1
coord = isnan(coord) ? -1 : coord;
if (!align_corners) { if (!align_corners) {
// Map unaligned grid space to aligned grid space // Map unaligned grid space to aligned grid space
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) / auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /

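The single-sided lines in this hunk fold NaN grid coordinates to -1 before the usual unnormalization, so they land outside the input and are handled like any other out-of-range sample. A small sketch of that step together with one common formulation of the align_corners mapping (the Metal kernel expresses the same mapping through a corner_alignment_factor):

#include <cmath>
#include <cstdint>

// Map a normalized grid coordinate in [-1, 1] to an (unclamped) input index.
// NaN coordinates are folded to -1 first, matching the hunk above.
float unnormalize_coord(float coord, int64_t input_size, bool align_corners) {
  if (std::isnan(coord)) {
    coord = -1.0f;
  }
  if (align_corners) {
    // -1 and +1 refer to the centers of the first and last input elements.
    return (coord + 1.0f) / 2.0f * static_cast<float>(input_size - 1);
  }
  // -1 and +1 refer to the outer edges of the first and last elements.
  return ((coord + 1.0f) * static_cast<float>(input_size) - 1.0f) / 2.0f;
}
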
View File

@ -52,7 +52,9 @@ static void fill_depthwise_conv_desc(MPSGraphDepthwiseConvolution3DOpDescriptor*
NSUInteger dilationRateInX, NSUInteger dilationRateInX,
NSUInteger dilationRateInY, NSUInteger dilationRateInY,
NSUInteger paddingHorizontal, NSUInteger paddingHorizontal,
NSUInteger paddingVertical) { NSUInteger paddingVertical,
c10::MemoryFormat memory_format,
NSUInteger groups) {
descriptor_.strides = descriptor_.strides =
@[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ]; @[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ];
descriptor_.dilationRates = descriptor_.dilationRates =
@ -101,7 +103,7 @@ static void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
descriptor_.groups = groups; descriptor_.groups = groups;
} }
static Tensor _mps_convolution_impl(const Tensor& input_t, static Tensor _mps_convolution_impl(const Tensor& input_t_,
const Tensor& weight_t, const Tensor& weight_t,
const std::optional<Tensor>& bias_opt, const std::optional<Tensor>& bias_opt,
IntArrayRef padding, IntArrayRef padding,
@ -109,15 +111,12 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
IntArrayRef dilation, IntArrayRef dilation,
int64_t groups, int64_t groups,
std::optional<IntArrayRef> input_shape) { std::optional<IntArrayRef> input_shape) {
constexpr auto kChannelsLast = MemoryFormat::ChannelsLast; const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
constexpr auto kContiguous = MemoryFormat::Contiguous; Tensor input_t = input_t_;
const bool is_macos_15_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); bool is3DConv = input_t.dim() == 5;
if (!is_macOS_15_0_or_newer || is3DConv) {
const bool is3DConv = input_t.dim() == 5; input_t = input_t.contiguous();
const auto memory_format = input_t.suggest_memory_format(); }
const auto input_suggested_layout = memory_format == kChannelsLast && is_macos_15_plus ? kChannelsLast : kContiguous;
const bool is_channels_last = mps_conv_use_channels_last(input_t, weight_t) && !is3DConv;
const bool bias_defined = bias_opt ? bias_opt->defined() : false;
TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types"); TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types");
@ -127,6 +126,15 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
checkAllSameType(c, {input, weight}); checkAllSameType(c, {input, weight});
checkAllSameGPU(c, {input, weight}); checkAllSameGPU(c, {input, weight});
bool bias_defined;
if (bias_opt == std::nullopt)
bias_defined = false;
else
bias_defined = bias_opt->defined();
auto memory_format = input_t.suggest_memory_format();
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
auto output_t = auto output_t =
at::empty(input_shape.has_value() ? input_shape.value() at::empty(input_shape.has_value() ? input_shape.value()
: conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation), : conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation),
@ -134,18 +142,12 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
std::nullopt, std::nullopt,
kMPS, kMPS,
std::nullopt, std::nullopt,
is_channels_last ? kChannelsLast : kContiguous); is_macOS_15_0_or_newer ? memory_format : MemoryFormat::Contiguous);
if (output_t.numel() == 0) { if (output_t.numel() == 0) {
return output_t; return output_t;
} }
TensorArg output{output_t, "result", 0}; TensorArg output{output_t, "result", 0};
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> output_c;
if (!is_macos_15_plus && is_channels_last) {
output_c = at::empty_like(output_t, output_t.options().memory_format(kContiguous));
}
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) { if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) {
// On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16 // On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16
for (auto elem : output_t.sizes()) { for (auto elem : output_t.sizes()) {
@ -184,22 +186,32 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
getArrayRefString(dilation), getArrayRefString(dilation),
getArrayRefString(padding), getArrayRefString(padding),
groups, groups,
input_suggested_layout == kChannelsLast, is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}), mps::getTensorsStringKey({input_t, weight_t}),
bias_defined, bias_defined,
bias_shape_key); bias_shape_key);
auto inputShape = mps::getMPSShape(input_t, input_suggested_layout); MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
auto outputShape = mps::getMPSShape(output_t, input_suggested_layout); MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { MPSNDArray* inputNDArray = nil;
bool isDepthwiseConv = MPSNDArray* outputNDArray = nil;
(groups > 1 && weight_t.size(1) == 1) && input_t.dim() >= 4 && weight_t.dim() >= 4 && !is_channels_last;
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t), inputShape); if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t); inputNDArray = getMPSNDArray(input_t, inputShape);
MPSGraphTensor* outputTensor = nil; outputNDArray = getMPSNDArray(*output, outputShape);
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* weightShape = mps::getMPSShape(weight_t);
bool isDepthwiseConv = ((groups > 1 && (weightShape[1].intValue == 1)) && inputShape.count >= 4 &&
weightShape.count >= 4 && !is_channels_last);
MPSGraphTensor* inputTensor =
mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t.scalar_type()), inputShape);
MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
MPSGraphTensor* outputTensor;
if (is3DConv) { if (is3DConv) {
auto conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease]; MPSGraphConvolution3DOpDescriptor* conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease];
fill_conv3d_desc(conv3dDescriptor_, fill_conv3d_desc(conv3dDescriptor_,
stride[2], stride[2],
stride[1], stride[1],
@ -217,9 +229,17 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
descriptor:conv3dDescriptor_ descriptor:conv3dDescriptor_
name:nil]; name:nil];
} else if (isDepthwiseConv) { } else if (isDepthwiseConv) {
auto depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
fill_depthwise_conv_desc( [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
memory_format,
groups);
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
dimension:-3 dimension:-3
@ -238,7 +258,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
dilation[0], dilation[0],
padding[1], padding[1],
padding[0], padding[0],
input_suggested_layout, memory_format,
groups); groups);
outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor
@ -250,6 +270,13 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
MPSGraphTensor* biasTensor = nil; MPSGraphTensor* biasTensor = nil;
if (bias_defined) { if (bias_defined) {
biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value())); biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value()));
}
if (is_channels_last && !is_macOS_15_0_or_newer) {
outputTensor = mps::convertNHWCtoNCHW(mpsGraph, outputTensor);
}
if (bias_defined) {
outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil]; outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil];
} }
newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->inputTensor_ = inputTensor;
@ -258,26 +285,27 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
newCachedGraph->outputTensor_ = outputTensor; newCachedGraph->outputTensor_ = outputTensor;
}); });
auto inputPlaceholder = input_suggested_layout == kContiguous auto inputPlaceholder = inputNDArray ? Placeholder(cachedGraph->inputTensor_, inputNDArray)
? Placeholder(cachedGraph->inputTensor_, output_c || is3DConv ? input_t.contiguous() : input_t) : Placeholder(cachedGraph->inputTensor_, input_t, inputShape);
: Placeholder(cachedGraph->inputTensor_, getMPSNDArray(input_t, inputShape)); auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto outputPlaceholder = input_suggested_layout == kContiguous
? Placeholder(cachedGraph->outputTensor_, output_c ? *output_c : output_t)
: Placeholder(cachedGraph->outputTensor_, getMPSNDArray(output_t, outputShape));
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, output_c ? weight_t.contiguous() : weight_t);
auto biasPlaceholder = Placeholder(); auto biasPlaceholder = Placeholder();
// Reshape the bias to be broadcastable with output of conv2d or conv3d // Reshape the bias to be broadcastable with output of conv2d or conv3d
if (bias_defined) { if (bias_defined) {
if (is3DConv) { if (is3DConv) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1, 1})); biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1, 1}));
} else if (input_suggested_layout == kChannelsLast) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, 1, 1, bias_shape[0]}));
} else { } else {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1})); if (is_channels_last && is_macOS_15_0_or_newer) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, 1, 1, bias_shape[0]}));
} else {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1}));
}
} }
} }
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
: Placeholder(cachedGraph->outputTensor_, *output);
auto feeds = [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease]; NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData();
if (bias_defined) { if (bias_defined) {
@ -287,11 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
} }
if (output_c) { return *output;
output_t.copy_(*output_c);
}
return output_t;
} }
Tensor _mps_convolution(const Tensor& input_t, Tensor _mps_convolution(const Tensor& input_t,
@ -327,21 +351,14 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2}; TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2};
checkAllSameType(c, {grad_output, weight}); checkAllSameType(c, {grad_output, weight});
checkAllSameGPU(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight});
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast; auto memory_format = grad_output_t.suggest_memory_format();
bool is_channels_last = mps_conv_use_channels_last(grad_output_t, weight_t) && !is3DConv; bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
auto grad_input_t = auto grad_input_t = at::empty(input_size, grad_output_t.options(), std::nullopt);
at::empty(input_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt);
// Avoid "grad_input" when this is being used as transposed convolution // Avoid "grad_input" when this is being used as transposed convolution
TensorArg grad_input{grad_input_t, "result", 0}; TensorArg grad_input{grad_input_t, "result", 0};
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> grad_input_c;
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
grad_input_c = at::empty_like(grad_input_t, grad_input_t.options().memory_format(MemoryFormat::Contiguous));
}
// Derive from MPSCachedGraph // Derive from MPSCachedGraph
struct CachedGraph : public MPSCachedGraph { struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@ -353,6 +370,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
// Add backward with input // Add backward with input
@autoreleasepool { @autoreleasepool {
MPSStream* stream = getCurrentMPSStream(); MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_input_shape = getMPSShape(input_size); MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}", std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "", is3DConv ? "3d_" : "",
@ -393,8 +411,15 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
} else if (isDepthwiseConv) { } else if (isDepthwiseConv) {
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc( fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
at::MemoryFormat::Contiguous,
groups);
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
dimension:-3 dimension:-3
withDimension:-4 withDimension:-4
@ -429,18 +454,14 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
newCachedGraph->gradInputTensor_ = gradInputTensor; newCachedGraph->gradInputTensor_ = gradInputTensor;
}); });
auto gradOutputPlaceholder = auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
Placeholder(cachedGraph->gradOutputTensor_, grad_input_c ? grad_output_t.contiguous() : grad_output_t); auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, grad_input_c ? weight_t.contiguous() : weight_t); auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input_c ? *grad_input_c : grad_input_t);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder); auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
} }
if (grad_input_c) { return *grad_input;
grad_input_t.copy_(*grad_input_c);
}
return grad_input_t;
} }
static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
@ -453,11 +474,9 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
bool bias_defined) { bool bias_defined) {
using namespace at::native::mps; using namespace at::native::mps;
using namespace mps; using namespace mps;
const bool is3DConv = input_t.dim() == 5; bool is3DConv = input_t.dim() == 5;
TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types"); TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
CheckedFrom c = "mps_convolution_backward_weights"; CheckedFrom c = "mps_convolution_backward_weights";
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast;
bool is_channels_last = mps_conv_use_channels_last(input_t, grad_output_t) && !is3DConv;
// For uniformity with everything else, although it seems grad_weight // For uniformity with everything else, although it seems grad_weight
// would be unambiguous too. // would be unambiguous too.
@ -468,8 +487,7 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
checkAllSameGPU(c, {grad_output, input}); checkAllSameGPU(c, {grad_output, input});
auto grad_weight_t = auto grad_weight_t =
at::empty(weight_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt); at::empty(weight_size, grad_output_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
TensorArg grad_weight{grad_weight_t, "result", 0}; TensorArg grad_weight{grad_weight_t, "result", 0};
convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
@ -482,23 +500,16 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSGraphTensor* gradWeightTensor_ = nil; MPSGraphTensor* gradWeightTensor_ = nil;
}; };
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> grad_weight_c;
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
grad_weight_c = at::empty_like(grad_weight_t, grad_weight_t.options().memory_format(MemoryFormat::Contiguous));
}
@autoreleasepool { @autoreleasepool {
MPSStream* stream = getCurrentMPSStream(); MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size); MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}:{}", std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "", is3DConv ? "3d_" : "",
getArrayRefString(stride), getArrayRefString(stride),
getArrayRefString(dilation), getArrayRefString(dilation),
getArrayRefString(padding), getArrayRefString(padding),
groups, groups,
is_channels_last,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t})); getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t); MPSShape* inputShape = getMPSShape(input_t);
@ -530,8 +541,15 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
} else if (isDepthwiseConv) { } else if (isDepthwiseConv) {
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc( fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
at::MemoryFormat::Contiguous,
groups);
NSNumber* outputFeatChannelDim = mps_weight_shape[0]; NSNumber* outputFeatChannelDim = mps_weight_shape[0];
MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ]; MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ];
MPSGraphTensor* gradWeightTensorTranspose = MPSGraphTensor* gradWeightTensorTranspose =
@ -565,19 +583,14 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
newCachedGraph->gradWeightTensor_ = gradWeightTensor; newCachedGraph->gradWeightTensor_ = gradWeightTensor;
}); });
auto gradOutputPlaceholder = auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
Placeholder(cachedGraph->gradOutputTensor_, grad_weight_c ? grad_output_t.contiguous() : grad_output_t); auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, grad_weight_c ? input_t.contiguous() : input_t); auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t);
auto outputPlaceholder =
Placeholder(cachedGraph->gradWeightTensor_, grad_weight_c ? *grad_weight_c : grad_weight_t);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
} }
if (grad_weight_c) {
grad_weight_t.copy_(*grad_weight_c);
}
return grad_weight_t; return grad_weight_t;
} }
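
Most of the churn in this file is about when the convolution can consume a channels-last (NHWC) tensor directly and when it must fall back to a contiguous (NCHW) scratch buffer that is copied into the destination afterwards (the output_c / grad_input_c / grad_weight_c temporaries on one side of the diff). To make the two layouts concrete, a standalone sketch of such a copy on plain buffers, with hypothetical dimension names:

#include <cstdint>
#include <vector>

// Copy an NCHW-contiguous buffer into an NHWC-contiguous buffer of the same
// logical shape. This is the "compute contiguous, then copy_" fallback in
// miniature: the math runs on src, the caller hands out dst.
void nchw_to_nhwc(const std::vector<float>& src, std::vector<float>& dst,
                  int64_t N, int64_t C, int64_t H, int64_t W) {
  dst.resize(static_cast<size_t>(N * C * H * W));
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t h = 0; h < H; ++h)
        for (int64_t w = 0; w < W; ++w)
          dst[((n * H + h) * W + w) * C + c] = src[((n * C + c) * H + h) * W + w];
}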

View File

@ -66,12 +66,11 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
int64_t num_indices = indices.size(0); int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0); int64_t num_bags = offsets.size(0);
if (include_last_offset) { if (include_last_offset) {
TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
num_bags -= 1; num_bags -= 1;
} }
int64_t feature_size = weight.size(1); int64_t feature_size = weight.size(1);
auto bag_size = at::empty({num_bags}, indices.options()); auto bag_size = at::empty(offsets.sizes(), indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options()); auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options()); auto output = at::empty({num_bags, feature_size}, weight.options());
@ -95,7 +94,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
} }
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined(); bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0; params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices; params.num_indices = num_indices;
params.num_bags = num_bags; params.num_bags = num_bags;
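
For context on num_bags and bag_size above: the offsets tensor stores where each bag's indices start, and with include_last_offset the final entry is the total index count rather than the start of another bag, which is why num_bags is decremented (and why one side adds a check that at least one offset exists). A small sketch of turning offsets into per-bag [start, end) ranges under that convention:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<int64_t, int64_t>> bag_ranges(const std::vector<int64_t>& offsets,
                                                    int64_t num_indices,
                                                    bool include_last_offset) {
  int64_t num_bags = static_cast<int64_t>(offsets.size());
  if (include_last_offset) {
    assert(num_bags >= 1 && "include_last_offset: number of offsets should be at least 1");
    num_bags -= 1;  // the last entry closes the final bag instead of opening a new one
  }
  std::vector<std::pair<int64_t, int64_t>> ranges;
  ranges.reserve(static_cast<size_t>(num_bags));
  for (int64_t b = 0; b < num_bags; ++b) {
    const int64_t start = offsets[b];
    const int64_t end =
        (b + 1 < static_cast<int64_t>(offsets.size())) ? offsets[b + 1] : num_indices;
    ranges.emplace_back(start, end);
  }
  return ranges;
}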

View File

@ -20,7 +20,6 @@
#include <ATen/ops/baddbmm_native.h> #include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h> #include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h> #include <ATen/ops/cholesky_native.h>
#include <ATen/ops/eye_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h> #include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_inv_ex_native.h> #include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h> #include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -497,24 +496,26 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
using namespace mps; using namespace mps;
TORCH_CHECK(result.is_mps(), "Output tensor is not MPS"); TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!"); TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
using CachedGraph = MPSUnaryCachedGraph;
MPSStream* stream = getCurrentMPSStream();
info.zero_(); info.zero_();
if (A.numel() == 0) { if (A.numel() == 0) {
return; return;
} }
if (!result.is_contiguous()) {
result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
}
auto A_sizes = A.sizes(); auto A_sizes = A.sizes();
int ndim = A.dim(); int ndim = A.dim();
Tensor LU = empty_like(A, MemoryFormat::Contiguous); Tensor LU = empty_like(A);
Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A); Tensor identity = zeros_like(A);
Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt)); Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
// need to do this to keep the strides of the result tensor (ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
// mps's solve expects row major layout, while inductor linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
// expects result to be column major
Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
result.copy_(tmp);
} }
static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) { static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {

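Both versions of linalg_inv_ex_out_mps_impl reduce inversion to a solve against the identity: if A * X = I then X = A^-1. They differ only in whether the identity is built with eye or by filling the diagonal of a zeros tensor, and in how the result's layout is handled. A self-contained Gauss-Jordan sketch of the identity-RHS trick on a dense row-major matrix, purely illustrative, not the MPS kernel:

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <utility>
#include <vector>

// Invert an n x n row-major matrix by solving A * X = I with Gauss-Jordan
// elimination and partial pivoting. Returns X (also row-major).
std::vector<double> inverse_via_solve(std::vector<double> a, int64_t n) {
  std::vector<double> x(n * n, 0.0);
  for (int64_t i = 0; i < n; ++i) x[i * n + i] = 1.0;  // right-hand side starts as I
  for (int64_t col = 0; col < n; ++col) {
    // Pick the largest pivot in this column for numerical stability.
    int64_t piv = col;
    for (int64_t r = col + 1; r < n; ++r)
      if (std::fabs(a[r * n + col]) > std::fabs(a[piv * n + col])) piv = r;
    if (a[piv * n + col] == 0.0) throw std::runtime_error("singular matrix");
    for (int64_t c = 0; c < n; ++c) {
      std::swap(a[col * n + c], a[piv * n + c]);
      std::swap(x[col * n + c], x[piv * n + c]);
    }
    const double d = a[col * n + col];
    for (int64_t c = 0; c < n; ++c) { a[col * n + c] /= d; x[col * n + c] /= d; }
    for (int64_t r = 0; r < n; ++r) {
      if (r == col) continue;
      const double f = a[r * n + col];
      for (int64_t c = 0; c < n; ++c) {
        a[r * n + c] -= f * a[col * n + c];
        x[r * n + c] -= f * x[col * n + c];
      }
    }
  }
  return x;
}
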
View File

@ -519,13 +519,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
Tensor& output, Tensor& output,
const int32_t pooling_dims, const int32_t pooling_dims,
const std::string& op_name) { const std::string& op_name) {
TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
op_name,
"There should be exactly ",
pooling_dims,
" elements but got ",
output_size_.size());
auto dims = input.dim(); auto dims = input.dim();
auto leading_dims = input.dim() - pooling_dims; auto leading_dims = input.dim() - pooling_dims;

View File

@ -9,22 +9,11 @@
#else #else
#include <ATen/ops/_unique2.h> #include <ATen/ops/_unique2.h>
#include <ATen/ops/_unique2_native.h> #include <ATen/ops/_unique2_native.h>
#include <ATen/ops/arange.h>
#include <ATen/ops/argsort.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/cumsum.h>
#include <ATen/ops/full.h>
#include <ATen/ops/masked_select.h>
#include <ATen/ops/nonzero.h>
#include <ATen/ops/ones.h>
#include <ATen/ops/ones_like.h>
#include <ATen/ops/slice.h> #include <ATen/ops/slice.h>
#include <ATen/ops/unique_consecutive.h> #include <ATen/ops/unique_consecutive.h>
#include <ATen/ops/unique_consecutive_native.h> #include <ATen/ops/unique_consecutive_native.h>
#include <ATen/ops/unique_dim_consecutive.h> #include <ATen/ops/unique_dim_consecutive.h>
#include <ATen/ops/unique_dim_consecutive_native.h> #include <ATen/ops/unique_dim_consecutive_native.h>
#include <ATen/ops/unique_dim_native.h>
#include <ATen/ops/zeros.h>
#endif #endif
namespace at::native { namespace at::native {
@ -316,85 +305,4 @@ std::tuple<Tensor, Tensor, Tensor> _unique2_mps(const Tensor& self,
return _unique_impl_mps(self, return_inverse, return_counts, false, std::nullopt); return _unique_impl_mps(self, return_inverse, return_counts, false, std::nullopt);
} }
static Tensor lexsort_rows_perm_mps(const Tensor& mat_2d) {
const auto rows = mat_2d.size(0), cols = mat_2d.size(1);
if (rows <= 1 || cols == 0) {
return arange(rows, mat_2d.options().dtype(kLong));
}
auto perm = arange(rows, mat_2d.options().dtype(kLong));
for (auto c = cols - 1; c >= 0; --c) {
auto keys = mat_2d.select(1, c).index_select(0, perm);
const auto idx = argsort(keys, /*dim=*/0, /*descending=*/false);
perm = perm.index_select(0, idx);
}
return perm;
}
static std::tuple<Tensor, Tensor, Tensor> unique_dim_sorted_mps_impl(const Tensor& self,
int64_t dim,
bool return_inverse,
bool return_counts) {
dim = maybe_wrap_dim(dim, self.dim());
auto sizes = self.sizes().vec();
auto num_zero_dims = std::count(sizes.begin(), sizes.end(), (int64_t)0);
if (self.size(dim) == 0) {
auto output = at::empty(sizes, self.options());
auto inverse_indices = at::empty({0}, self.options().dtype(kLong));
auto counts = at::empty({0}, self.options().dtype(kLong));
return {output, inverse_indices, counts};
}
auto transposed = self.moveaxis(dim, 0);
auto orig_sizes = transposed.sizes().vec();
auto rows = transposed.size(0);
auto input_flat = transposed.contiguous().view({rows, -1});
auto perm = lexsort_rows_perm_mps(input_flat);
auto input_sorted = input_flat.index_select(0, perm);
Tensor is_unique = at::zeros({rows}, self.options().dtype(kBool));
if (rows > 0) {
is_unique.narrow(0, 0, 1).fill_(true);
}
if (rows > 1) {
auto a = input_sorted.narrow(0, 1, rows - 1);
auto b = input_sorted.narrow(0, 0, rows - 1);
auto row_changed = a.ne(b).any(1);
is_unique.narrow(0, 1, rows - 1).copy_(row_changed);
}
auto unique_pos = nonzero(is_unique).squeeze(1);
auto group_id = cumsum(is_unique.to(kLong), 0).sub(1);
auto unique_rows_2d = input_sorted.index_select(0, unique_pos);
Tensor inverse_indices = empty({0}, self.options().dtype(kLong));
if (return_inverse) {
inverse_indices = empty({rows}, self.options().dtype(kLong));
inverse_indices.index_copy_(0, perm, group_id);
}
Tensor counts = empty({0}, self.options().dtype(kLong));
if (return_counts) {
const auto num_unique = unique_pos.size(0);
counts = zeros({num_unique}, self.options().dtype(kLong));
counts.scatter_add_(0, group_id, ones_like(group_id, group_id.options().dtype(kLong)));
}
orig_sizes[0] = unique_rows_2d.size(0);
auto output = unique_rows_2d.view(orig_sizes).moveaxis(0, dim);
return std::make_tuple(std::move(output), std::move(inverse_indices), std::move(counts));
}
std::tuple<Tensor, Tensor, Tensor> unique_dim_mps(const Tensor& self,
int64_t dim,
const bool /*sorted*/,
const bool return_inverse,
const bool return_counts) {
return unique_dim_sorted_mps_impl(self, dim, return_inverse, return_counts);
}
} // namespace at::native } // namespace at::native
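
The unique_dim implementation shown on one side of this file follows a classic recipe: move the target dimension to the front, flatten each slice into a row, lexicographically sort the rows, mark rows that differ from their predecessor, and derive inverse indices and counts from that mask. A standalone sketch of the same recipe on a plain matrix of rows:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

struct UniqueRows {
  std::vector<std::vector<double>> rows;  // unique rows, in sorted order
  std::vector<int64_t> inverse;           // original row -> index into rows
  std::vector<int64_t> counts;            // occurrences of each unique row
};

UniqueRows unique_rows(const std::vector<std::vector<double>>& mat) {
  const int64_t n = static_cast<int64_t>(mat.size());
  std::vector<int64_t> perm(n);
  std::iota(perm.begin(), perm.end(), 0);
  // Lexicographic sort of row indices (std::vector compares lexicographically).
  std::sort(perm.begin(), perm.end(),
            [&](int64_t a, int64_t b) { return mat[a] < mat[b]; });
  UniqueRows out;
  out.inverse.assign(n, 0);
  for (int64_t i = 0; i < n; ++i) {
    const auto& row = mat[perm[i]];
    // A row starts a new group iff it differs from the previous sorted row.
    if (i == 0 || row != mat[perm[i - 1]]) {
      out.rows.push_back(row);
      out.counts.push_back(0);
    }
    const int64_t group = static_cast<int64_t>(out.rows.size()) - 1;
    out.inverse[perm[i]] = group;
    out.counts[group] += 1;
  }
  return out;
}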

View File

@ -1409,7 +1409,7 @@
- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) - func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
variants: function variants: function
dispatch: dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_broadcast_to SparseCPU, SparseCUDA: sparse_broadcast_to
- func: cat(Tensor[] tensors, int dim=0) -> Tensor - func: cat(Tensor[] tensors, int dim=0) -> Tensor
structured_delegate: cat.out structured_delegate: cat.out
@ -3858,7 +3858,7 @@
device_check: NoCheck # TensorIterator device_check: NoCheck # TensorIterator
structured: True structured: True
dispatch: dispatch:
CPU, CUDA, MTIA: aminmax_out CPU, CUDA: aminmax_out
MPS: aminmax_out_mps MPS: aminmax_out_mps
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
@ -3909,7 +3909,7 @@
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True structured: True
dispatch: dispatch:
CPU, CUDA, MTIA: amax_out CPU, CUDA: amax_out
MPS: amax_out_mps MPS: amax_out_mps
# Return: (Tensor output, Tensor indices) # Return: (Tensor output, Tensor indices)
@ -4090,7 +4090,7 @@
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True structured: True
dispatch: dispatch:
CPU, CUDA, MTIA: amin_out CPU, CUDA: amin_out
MPS: amin_out_mps MPS: amin_out_mps
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in # TODO: Add this function to MPS dispatch key so that we avoid declaring it in
@ -6450,7 +6450,6 @@
dispatch: dispatch:
CPU: unique_dim_cpu CPU: unique_dim_cpu
CUDA: unique_dim_cuda CUDA: unique_dim_cuda
MPS: unique_dim_mps
tags: dynamic_output_shape tags: dynamic_output_shape
autogen: unique_dim.out autogen: unique_dim.out

View File

@ -158,46 +158,12 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
return packed_ptr; return packed_ptr;
} }
#ifdef USE_FBGEMM
namespace {
/// Number of columns in the rowwise min/max buffer passed to the quantization function(s)
constexpr int kRowwiseMinMaxNumCols = 2;
bool _validate_rowwise_min_max(
const at::Tensor& weight,
const std::optional<at::Tensor>& rowwise_min_max_opt) {
const auto is_valid_rowwise_min_max = rowwise_min_max_opt.has_value();
if (is_valid_rowwise_min_max) {
TORCH_CHECK(
(rowwise_min_max_opt->dim() == 2 &&
rowwise_min_max_opt->size(0) == weight.size(0) &&
rowwise_min_max_opt->size(1) == kRowwiseMinMaxNumCols),
"'rowwise_min_max' must be a 2D tensor with shape [num_rows(weight), 2].");
}
return is_valid_rowwise_min_max;
}
auto _get_rowwise_min_max_contig(
const std::optional<at::Tensor>& rowwise_min_max_opt) {
return rowwise_min_max_opt.has_value()
? rowwise_min_max_opt->expect_contiguous(rowwise_min_max_opt->suggest_memory_format())
: at::borrow_from_optional_tensor(rowwise_min_max_opt);
}
}
#endif // USE_FBGEMM
namespace at::native { namespace at::native {
// Note - This is a temporary pack function for embedding bag which quantizes // Note - This is a temporary pack function for embedding bag which quantizes
// and packs the float weight tensor. In the next step it will be replaced by a // and packs the float weight tensor. In the next step it will be replaced by a
// quantize and pack function once we support FP scale and FP zero_point // quantize and pack function once we support FP scale and FP zero_point
// //
// The optional rowwise_min_max argument is to support callers to pass in the min/max
// values of the weight tensor. If the rowwise_min_max is not provided, the min/max
// values will be computed from the weight tensor.
//
// Python example examining a packed 8bit zero_point and scale: // Python example examining a packed 8bit zero_point and scale:
// //
// >> x = torch.from_numpy(np.array([[[10, 20], [30, 40]],[[50, 60], [70, 80]]], // >> x = torch.from_numpy(np.array([[[10, 20], [30, 40]],[[50, 60], [70, 80]]],
@ -255,10 +221,7 @@ namespace at::native {
// //
// [[50. , 60.00000035], // [[50. , 60.00000035],
// [70. , 80.00000035]]]) // [70. , 80.00000035]]])
Tensor& qembeddingbag_byte_prepack_out( Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
Tensor& output,
const Tensor& weight,
const std::optional<Tensor>& rowwise_min_max_opt) {
// The "last" dimension of an N-Dimensioned batch of embedding bags is // The "last" dimension of an N-Dimensioned batch of embedding bags is
// quantization channel. E.g. for a 2D embedding bag, this has // quantization channel. E.g. for a 2D embedding bag, this has
// [ row, col ] dimensions, for batched of embedding bags, dimensions might be // [ row, col ] dimensions, for batched of embedding bags, dimensions might be
@ -293,16 +256,9 @@ Tensor& qembeddingbag_byte_prepack_out(
auto* output_data = output.data_ptr<uint8_t>(); auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM #ifdef USE_FBGEMM
// Move these outside of the ifdef when we support non-FBGEMM flow.
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
if (weight_contig->scalar_type() == at::ScalarType::Half) { if (weight_contig->scalar_type() == at::ScalarType::Half) {
const auto weight_data = const auto weight_data =
static_cast<fbgemm::float16*>(weight_contig->data_ptr()); static_cast<fbgemm::float16*>(weight_contig->data_ptr());
const auto rowwise_min_max_data = is_valid_rowwise_min_max
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
: nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat< fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<
@ -310,21 +266,17 @@ Tensor& qembeddingbag_byte_prepack_out(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
embedding_cols, embedding_cols,
output_data + start_idx * output_columns, output_data + start_idx * output_columns);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} else { } else {
const auto weight_data = weight_contig->data_ptr<float>(); const auto weight_data = weight_contig->data_ptr<float>();
const auto rowwise_min_max_data =
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>( fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
embedding_cols, embedding_cols,
output_data + start_idx * output_columns, output_data + start_idx * output_columns);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} }
@ -374,22 +326,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
return output; return output;
} }
static Tensor qembeddingbag_byte_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max) {
const auto weight_contig =
weight.expect_contiguous(weight.suggest_memory_format());
Tensor output = at::detail::empty_cpu(
{0},
at::kByte,
weight_contig->layout(),
weight_contig->device(),
std::nullopt,
std::nullopt);
qembeddingbag_byte_prepack_out(output, weight, rowwise_min_max);
return output;
}
Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) { Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
const auto weight_contig = const auto weight_contig =
weight.expect_contiguous(weight.suggest_memory_format()); weight.expect_contiguous(weight.suggest_memory_format());
@ -399,7 +335,7 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
"'embedding_bag_byte_prepack' only support float32 or float16."); "'embedding_bag_byte_prepack' only support float32 or float16.");
const auto weight_sizes = weight.sym_sizes(); const auto weight_sizes = weight.sym_sizes();
const auto cols_dim = weight.ndimension() - 1; const auto cols_dim = weight.ndimension() - 1;
const auto& embedding_cols = weight_sizes[cols_dim]; const auto embedding_cols = weight_sizes[cols_dim];
// Add 8 bytes per column to store FP32 scale and zero_point per row. // Add 8 bytes per column to store FP32 scale and zero_point per row.
const auto output_columns = embedding_cols + 2 * sizeof(float); const auto output_columns = embedding_cols + 2 * sizeof(float);
@ -423,8 +359,7 @@ Tensor _qembeddingbag_nbit_prepack_helper(
int bit_width, int bit_width,
const bool optimized_qparams, const bool optimized_qparams,
const int64_t nbins, const int64_t nbins,
const double ratio, const double ratio) {
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt) {
TORCH_CHECK( TORCH_CHECK(
weight.scalar_type() == at::ScalarType::Float || weight.scalar_type() == at::ScalarType::Float ||
weight.scalar_type() == at::ScalarType::Half, weight.scalar_type() == at::ScalarType::Half,
@ -466,17 +401,10 @@ Tensor _qembeddingbag_nbit_prepack_helper(
auto* output_data = output.data_ptr<uint8_t>(); auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM #ifdef USE_FBGEMM
// Move these outside of the ifdef when we support non-FBGEMM flow.
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
if (!optimized_qparams) { if (!optimized_qparams) {
if (weight_contig.scalar_type() == at::ScalarType::Half) { if (weight_contig.scalar_type() == at::ScalarType::Half) {
const auto weight_data = const auto weight_data =
static_cast<fbgemm::float16*>(weight_contig.data_ptr()); static_cast<fbgemm::float16*>(weight_contig.data_ptr());
const auto rowwise_min_max_data = is_valid_rowwise_min_max
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
: nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf< fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<
@ -485,13 +413,10 @@ Tensor _qembeddingbag_nbit_prepack_helper(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
static_cast<int>(embedding_cols), static_cast<int>(embedding_cols),
output_data + start_idx * output_shape[1], output_data + start_idx * output_shape[1]);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} else { } else {
const auto weight_data = weight_contig.data_ptr<float>(); const auto weight_data = weight_contig.data_ptr<float>();
const auto rowwise_min_max_data =
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>( fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
@ -499,8 +424,7 @@ Tensor _qembeddingbag_nbit_prepack_helper(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
static_cast<int>(embedding_cols), static_cast<int>(embedding_cols),
output_data + start_idx * output_shape[1], output_data + start_idx * output_shape[1]);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} }
} else { } else {
@ -590,16 +514,6 @@ Tensor qembeddingbag_4bit_prepack(
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio); weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio);
} }
Tensor qembeddingbag_4bit_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max,
const bool optimized_qparams,
const int64_t nbins,
const double ratio) {
return _qembeddingbag_nbit_prepack_helper(
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
}
// Applies 2-bit row-wise quantization by determining the range // Applies 2-bit row-wise quantization by determining the range
// (maximum - minimum) and bias (minimum value) of each row in the input // (maximum - minimum) and bias (minimum value) of each row in the input
// matrix, and then scaling each element to an 2-bit number between 0 and // matrix, and then scaling each element to an 2-bit number between 0 and
@ -617,16 +531,6 @@ Tensor qembeddingbag_2bit_prepack(
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio); weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio);
} }
Tensor qembeddingbag_2bit_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max,
const bool optimized_qparams,
const int64_t nbins,
const double ratio) {
return _qembeddingbag_nbit_prepack_helper(
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
}
class QEmbeddingPackWeights final { class QEmbeddingPackWeights final {
public: public:
static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(const at::Tensor& weight) { static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(const at::Tensor& weight) {
@ -638,21 +542,12 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
m.impl( m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"), TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"),
TORCH_FN(qembeddingbag_byte_prepack)); TORCH_FN(qembeddingbag_byte_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_byte_prepack_with_rowwise_min_max));
m.impl( m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"), TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"),
TORCH_FN(qembeddingbag_4bit_prepack)); TORCH_FN(qembeddingbag_4bit_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_4bit_prepack_with_rowwise_min_max));
m.impl( m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"), TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"),
TORCH_FN(qembeddingbag_2bit_prepack)); TORCH_FN(qembeddingbag_2bit_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_2bit_prepack_with_rowwise_min_max));
} }
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {

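Throughout this file the byte-prepack layout is: each quantized row stores its uint8 values followed by an FP32 scale and an FP32 bias, which is where the extra 2 * sizeof(float) output columns come from. A standalone sketch of that rowwise scheme for a single non-empty row, including the optional caller-supplied {min, max} pair that the *_with_rowwise_min_max variants plumb through; the choice of bias = row minimum is an assumption of this sketch, not spelled out in the diff:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Quantize one float row to a "fused 8-bit rowwise" layout:
// [ uint8 values ... ][ fp32 scale ][ fp32 bias ].
// If minmax is non-null it supplies {row_min, row_max}; otherwise both are
// computed from the row itself.
std::vector<uint8_t> quantize_row_fused8(const std::vector<float>& row,
                                         const float* minmax = nullptr) {
  float lo, hi;
  if (minmax) {
    lo = minmax[0];
    hi = minmax[1];
  } else {
    lo = *std::min_element(row.begin(), row.end());
    hi = *std::max_element(row.begin(), row.end());
  }
  const float scale = (hi - lo) / 255.0f;
  const float inv_scale = (scale == 0.0f) ? 1.0f : 1.0f / scale;
  std::vector<uint8_t> out(row.size() + 2 * sizeof(float));
  for (size_t i = 0; i < row.size(); ++i) {
    const float q = std::nearbyint((row[i] - lo) * inv_scale);
    out[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
  }
  std::memcpy(out.data() + row.size(), &scale, sizeof(float));              // scale
  std::memcpy(out.data() + row.size() + sizeof(float), &lo, sizeof(float)); // bias (row min)
  return out;
}
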
View File

@ -3,10 +3,7 @@
namespace at::native { namespace at::native {
Tensor& qembeddingbag_byte_prepack_out( Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
Tensor& output,
const Tensor& weight,
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt);
Tensor qembeddingbag_byte_prepack(const Tensor& weight); Tensor qembeddingbag_byte_prepack(const Tensor& weight);

View File

@ -121,12 +121,9 @@ TORCH_LIBRARY(quantized, m) {
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
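
The schema strings here are the declaration side of the operators whose kernels are bound with m.impl in the TORCH_LIBRARY_IMPL block of the previous file. A minimal sketch of that def/impl split using a hypothetical namespace and operator (example_ns::my_scale does not exist in this diff):

#include <ATen/ATen.h>
#include <torch/library.h>

// Hypothetical operator used only to illustrate the def/impl split:
// the schema is declared once, then the CPU kernel is registered against it.
static at::Tensor my_scale(const at::Tensor& x, double factor) {
  return x * factor;
}

TORCH_LIBRARY(example_ns, m) {
  m.def("my_scale(Tensor x, float factor) -> Tensor");
}

TORCH_LIBRARY_IMPL(example_ns, CPU, m) {
  m.impl("my_scale", TORCH_FN(my_scale));
}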

Some files were not shown because too many files have changed in this diff.