Fixes

Update build_triton_whl.py for new devices
Update build-triton-wheel.yml
2025-11-19 10:04:58 +08:00 · 2025-11-17 11:43:15 +00:00 · 2025-11-14 10:55:45 +00:00 · 2025-11-12 12:33:53 +00:00 · 2025-11-11 16:36:28 +00:00 · 2025-11-11 13:31:25 +00:00
782 changed files with 7795 additions and 24881 deletions
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -188,7 +188,7 @@ case "$tag" in
    fi
    GCC_VERSION=11
    VISION=yes
-    ROCM_VERSION=7.1
+    ROCM_VERSION=7.0
    NINJA_VERSION=1.9.0
    TRITON=yes
    KATEX=yes
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-bfeb066872bc1e8b2d2bc0a3b295b99dd77206e7
+0add68262ab0a2e33b84524346cb27cbb2787356
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -60,16 +60,14 @@ EOF
        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
    fi

-    if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then
-      # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
-      # search for all unversioned packages
-      # if search fails it will abort this script; use true to avoid case where search fails
-      MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
-      if [[ "x${MIOPENHIPGFX}" = x ]]; then
-        echo "miopen-hip-gfx package not available" && exit 1
-      else
-        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
-      fi
+    # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
+    # search for all unversioned packages
+    # if search fails it will abort this script; use true to avoid case where search fails
+    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
+    if [[ "x${MIOPENHIPGFX}" = x ]]; then
+      echo "miopen-hip-gfx package not available" && exit 1
+    else
+      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
    fi

    # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -12,8 +12,8 @@ function do_install() {

    rocm_version_nodot=${rocm_version//./}

-    # https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+    # post merge of https://github.com/icl-utk-edu/magma/pull/65
+    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
--- a/.ci/magma-rocm/README.md
+++ b/.ci/magma-rocm/README.md
@ -30,6 +30,7 @@ into a tarball, with the following structure:
 More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
 Outputted binaries should be in the `output` folder.

+
 ## Pushing

 Packages can be uploaded to an S3 bucket using:
--- a/.ci/onnx/common.sh
+++ b/.ci/onnx/common.sh
@ -21,87 +21,3 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
 fi

 mkdir -p "$pytest_reports_dir" || true
-
-##########################################
-# copied from .ci/pytorch/common_utils.sh
-##########################################
-
-function get_pinned_commit() {
-  cat .github/ci_commit_pins/"${1}".txt
-}
-
-function pip_install_whl() {
-  # This is used to install PyTorch and other build artifacts wheel locally
-  # without using any network connection
-
-  # Convert the input arguments into an array
-  local args=("$@")
-
-  # Check if the first argument contains multiple paths separated by spaces
-  if [[ "${args[0]}" == *" "* ]]; then
-    # Split the string by spaces into an array
-    IFS=' ' read -r -a paths <<< "${args[0]}"
-    # Loop through each path and install individually
-    for path in "${paths[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  else
-    # Loop through each argument and install individually
-    for path in "${args[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  fi
-}
-
-function pip_build_and_install() {
-  local build_target=$1
-  local wheel_dir=$2
-
-  local found_whl=0
-  for file in "${wheel_dir}"/*.whl
-  do
-    if [[ -f "${file}" ]]; then
-      found_whl=1
-      break
-    fi
-  done
-
-  # Build the wheel if it doesn't exist
-  if [ "${found_whl}" == "0" ]; then
-    python3 -m pip wheel \
-      --no-build-isolation \
-      --no-deps \
-      -w "${wheel_dir}" \
-      "${build_target}"
-  fi
-
-  for file in "${wheel_dir}"/*.whl
-  do
-    pip_install_whl "${file}"
-  done
-}
-
-function install_torchvision() {
-  local orig_preload
-  local commit
-  commit=$(get_pinned_commit vision)
-  orig_preload=${LD_PRELOAD}
-  if [ -n "${LD_PRELOAD}" ]; then
-    # Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
-    echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
-    LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
-  fi
-
-  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
-    # Not sure if both are needed, but why not
-    export FORCE_CUDA=1
-    export WITH_CUDA=1
-  fi
-  pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
-
-  if [ -n "${LD_PRELOAD}" ]; then
-    LD_PRELOAD=${orig_preload}
-  fi
-}
--- a/.ci/onnx/test.sh
+++ b/.ci/onnx/test.sh
@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace

 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # TODO: This can be removed later once vision is also part of the Docker image
-  install_torchvision
+  pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
  # JIT C++ extensions require ninja, so put it into PATH.
  export PATH="/var/lib/jenkins/.local/bin:$PATH"
  # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -96,6 +96,7 @@ function pip_build_and_install() {
    python3 -m pip wheel \
      --no-build-isolation \
      --no-deps \
+      --no-use-pep517 \
      -w "${wheel_dir}" \
      "${build_target}"
  fi
@ -307,28 +308,6 @@ function install_torchao() {
  pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao
 }

-function install_flash_attn_cute() {
-  echo "Installing FlashAttention CuTe from GitHub..."
-  # Grab latest main til we have a pinned commit
-  local flash_attn_commit
-  flash_attn_commit=$(git ls-remote https://github.com/Dao-AILab/flash-attention.git HEAD | cut -f1)
-
-  # Clone the repo to a temporary directory
-  rm -rf flash-attention-build
-  git clone --depth 1 --recursive https://github.com/Dao-AILab/flash-attention.git flash-attention-build
-
-  pushd flash-attention-build
-  git checkout "${flash_attn_commit}"
-
-  # Install only the 'cute' sub-directory
-  pip_install -e flash_attn/cute/
-  popd
-
-  # remove the local repo
-  rm -rf flash-attention-build
-  echo "FlashAttention CuTe installation complete."
-}
-
 function print_sccache_stats() {
  echo 'PyTorch Build Statistics'
  sccache --show-stats
--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -353,17 +353,6 @@ def test_linalg(device="cpu") -> None:
            torch.linalg.svd(A)


-def test_sdpa(device="cpu", dtype=torch.float16) -> None:
-    """Regression test for https://github.com/pytorch/pytorch/issues/167602
-    Without nvrtc_builtins on CuDNN-9.13 on CUDA-13 fails with ` No valid execution plans built.`
-    """
-    print(f"Testing SDPA on {device} using type {dtype}")
-    k, q, v = torch.rand(3, 1, 16, 77, 64, dtype=dtype, device=device).unbind(0)
-    attn = torch.rand(1, 1, 77, 77, dtype=dtype, device=device)
-    rc = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn)
-    assert rc.isnan().any().item() is False
-
-
 def smoke_test_compile(device: str = "cpu") -> None:
    supported_dtypes = [torch.float16, torch.float32, torch.float64]

@ -500,12 +489,10 @@ def main() -> None:
    smoke_test_conv2d()
    test_linalg()
    test_numpy()
-    test_sdpa()

    if is_cuda_system:
        test_linalg("cuda")
        test_cuda_gds_errors_captured()
-        test_sdpa("cuda")

    if options.package == "all":
        smoke_test_modules()
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -344,18 +344,8 @@ test_python_smoke() {
 }

 test_python_smoke_b200() {
-  # Targeted smoke tests for B200 including FlashAttention CuTe coverage
-  install_flash_attn_cute
-  time python test/run_test.py \
-    --include \
-      test_matmul_cuda \
-      test_scaled_matmul_cuda \
-      inductor/test_fp8 \
-      nn/attention/test_fa4 \
-      nn/attention/test_open_registry \
-      inductor/test_flex_flash \
-    $PYTHON_TEST_EXTRA_OPTION \
-    --upload-artifacts-while-running
+  # Targeted smoke tests for B200 - staged approach to avoid too many failures
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

@ -389,13 +379,6 @@ test_lazy_tensor_meta_reference_disabled() {
  export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
 }

-test_dynamo_core() {
-  time python test/run_test.py \
-    --include-dynamo-core-tests \
-    --verbose \
-    --upload-artifacts-while-running
-  assert_git_not_dirty
-}

 test_dynamo_wrapped_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
@ -1250,97 +1233,6 @@ test_custom_script_ops() {
  assert_git_not_dirty
 }

-test_libtorch_agnostic_targetting() {
-    echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"
-
-    REPO_DIR=$(pwd)
-    WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
-
-    # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
-    echo "Building 2.9 extension wheel with current PyTorch..."
-    pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
-    time python setup.py bdist_wheel
-
-    # Save the wheel
-    mkdir -p "$WHEEL_DIR"
-    cp dist/*.whl "$WHEEL_DIR/"
-    WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
-    echo "Built wheel: $(basename "$WHEEL_FILE")"
-    popd
-
-    # Create venv and install PyTorch 2.9
-    python -m venv venv_pytorch_2_9
-    # shellcheck disable=SC1091
-    . venv_pytorch_2_9/bin/activate
-
-    # Clear PYTHONPATH to avoid using the development PyTorch
-    echo "Clearing PYTHONPATH to use only venv packages..."
-    unset PYTHONPATH
-
-    # Upgrade pip to latest version
-    echo "Upgrading pip to latest version..."
-    pip install --upgrade pip
-    pip --version
-
-    echo "Installing PyTorch 2.9..."
-
-    # Install from release channel only
-    PYTORCH_VERSION="2.9.0"
-
-    # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
-    if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
-        CUDA_MAJOR="${BASH_REMATCH[1]}"
-        CUDA_MINOR="${BASH_REMATCH[2]}"
-        CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
-        echo "  Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
-    else
-        # Default to CPU build
-        CUDA_VERSION="cpu"
-        echo "  No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
-    fi
-
-    if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
-        echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
-    else
-        echo "  FAILED to install PyTorch 2.9.0 from release channel"
-        echo "  URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
-        deactivate
-        rm -rf venv_pytorch_2_9
-        return 1
-    fi
-
-    INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
-    echo "  Installed version: $INSTALLED_VERSION"
-
-    # Install test dependencies
-    echo "Installing test dependencies..."
-    pip install expecttest numpy unittest-xml-reporting
-
-    # Install the pre-built wheel
-    echo ""
-    echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
-    pip install "$WHEEL_FILE"
-    echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
-
-    # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
-    echo ""
-    echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
-    if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
-        echo ""
-        echo "  Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
-    else
-        echo "targeting test failed"
-        deactivate
-        rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
-        return 1
-    fi
-
-    deactivate
-    rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
-
-    assert_git_not_dirty
-}
-
 test_jit_hooks() {
  echo "Testing jit hooks in cpp"
  HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
@ -1768,7 +1660,7 @@ test_operator_microbenchmark() {

  cd "${TEST_DIR}"/benchmarks/operator_benchmark

-  for OP_BENCHMARK_TESTS in optimizer; do
+  for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
      --benchmark-name "PyTorch operator microbenchmark" --use-compile
@ -1778,22 +1670,6 @@ test_operator_microbenchmark() {
  done
 }

-test_attention_microbenchmark() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  TEST_DIR=$(pwd)
-
-  # Install attention-gym dependency
-  echo "Installing attention-gym..."
-  python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main
-  pip show triton
-
-  cd "${TEST_DIR}"/benchmarks/transformer
-
-  $TASKSET python score_mod.py --config configs/config_basic.yaml \
-    --output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json"
-}
-
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@ -1813,8 +1689,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
  # Do NOT add tests after bc check tests, see its comment.
-elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
-  test_libtorch_agnostic_targetting
 elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
@ -1853,8 +1727,6 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
  fi
 elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
  test_operator_microbenchmark
-elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then
-  test_attention_microbenchmark
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@ -1914,8 +1786,6 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  test_inductor_shard "${SHARD_NUMBER}"
 elif [[ "${TEST_CONFIG}" == *einops* ]]; then
  test_einops
-elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then
-  test_dynamo_core
 elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
  install_torchvision
  test_dynamo_wrapped_shard "${SHARD_NUMBER}"
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -63,7 +63,7 @@ self-hosted-runner:
    - linux.rocm.gpu.gfx942.1
    - linux.rocm.gpu.gfx942.2
    - linux.rocm.gpu.gfx942.4
-    - linux.rocm.gfx942.docker-cache
+    - rocm-docker
    # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
    - macos-m1-stable
    - macos-m1-14
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-ee1a1350eb37804b94334768f328144f058f14e9
+ad5816f0eee1c873df1b7d371c69f1f811a89387
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc
+ccb801b88af136454798b945175c4c87e636ac33
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-94631807d22c09723dd006f7be5beb649d5f88d0
+e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -91,6 +91,13 @@
 "ciflow/trunk":
 - .ci/docker/ci_commit_pins/triton.txt

+"oncall: distributed":
+- torch/csrc/distributed/**
+- torch/distributed/**
+- torch/nn/parallel/**
+- test/distributed/**
+- torch/testing/_internal/distributed/**
+
 "release notes: distributed (checkpoint)":
 - torch/distributed/checkpoint/**
 - test/distributed/checkpoint/**
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -7,7 +7,6 @@ ciflow_push_tags:
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
- ciflow/dynamo
 - ciflow/h100
 - ciflow/h100-cutlass-backend
 - ciflow/h100-distributed
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@ -136,7 +136,7 @@ def main() -> None:
    parser = ArgumentParser("Build Triton binaries")
    parser.add_argument("--release", action="store_true")
    parser.add_argument(
-        "--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu", "aarch64"]
+        "--device", type=str, default="cuda", choices=["cuda", "rocm-n", "rocm-n-1", "xpu", "aarch64"]
    )
    parser.add_argument("--py-version", type=str)
    parser.add_argument("--commit-hash", type=str)
@ -148,8 +148,13 @@ def main() -> None:
    if args.triton_version:
        triton_version = args.triton_version

+    # Normalize device name for rocm-n rocm-n-1 builds
+    device = args.device
+    if args.device.startswith("rocm"):
+        device = "rocm"
+        
    build_triton(
-        device=args.device,
+        device=device,
        commit_hash=(
            args.commit_hash if args.commit_hash else read_triton_pin(args.device)
        ),
--- a/.github/scripts/generate_pytorch_version.py
+++ b/.github/scripts/generate_pytorch_version.py
@ -50,7 +50,7 @@ def get_tag() -> str:

 def get_base_version() -> str:
    root = get_pytorch_root()
-    dirty_version = Path(root / "version.txt").read_text().strip()
+    dirty_version = open(root / "version.txt").read().strip()
    # Strips trailing a0 from version.txt, not too sure why it's there in the
    # first place
    return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version)
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -326,7 +326,7 @@ jobs:
          SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
          SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
-          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
--- a/.github/workflows/attention_op_microbenchmark.yml
+++ b/.github/workflows/attention_op_microbenchmark.yml
@ -1,73 +0,0 @@
-name: attention_op_microbenchmark
-
-on:
-  push:
-    tags:
-      - ciflow/op-benchmark/*
-  workflow_dispatch:
-  schedule:
-    # Run at 06:00 UTC everyday
-    - cron: 0 7 * * *
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  attn-microbenchmark-build:
-    if: github.repository_owner == 'pytorch'
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '8.0 9.0'
-      test-matrix: |
-        { include: [
-          { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
-          { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
-        ]}
-    secrets: inherit
-
-  attn-microbenchmark-test:
-    name: attn-microbenchmark-test
-    uses: ./.github/workflows/_linux-test.yml
-    needs: attn-microbenchmark-build
-    with:
-      timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
-      test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
-    secrets: inherit
-
-  # B200 runner
-  opmicrobenchmark-build-b200:
-    if: github.repository_owner == 'pytorch'
-    name: opmicrobenchmark-build-b200
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '10.0'
-      test-matrix: |
-        { include: [
-          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
-        ]}
-    secrets: inherit
-
-  opmicrobenchmark-test-b200:
-    name: opmicrobenchmark-test-b200
-    uses: ./.github/workflows/_linux-test.yml
-    needs: opmicrobenchmark-build-b200
-    with:
-      timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
-      docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
-      test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
-      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-    secrets: inherit
--- a/.github/workflows/b200-distributed.yml
+++ b/.github/workflows/b200-distributed.yml
@ -37,7 +37,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
--- a/.github/workflows/b200-symm-mem.yml
+++ b/.github/workflows/b200-symm-mem.yml
@ -37,7 +37,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -51,12 +51,15 @@ jobs:
      fail-fast: false
      matrix:
        py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
-        device: ["cuda", "rocm", "xpu", "aarch64"]
+        device: ["cuda", "rocm-n", "rocm-n-1", "xpu", "aarch64"]
        docker-image: ["pytorch/manylinux2_28-builder:cpu"]
        include:
-          - device: "rocm"
+          - device: "rocm-n"
            rocm_version: "7.1"
            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
+          - device: "rocm-n-1"
+            rocm_version: "7.0"
+            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
          - device: "cuda"
            rocm_version: ""
            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
@ -68,7 +71,7 @@ jobs:
            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge"
    timeout-minutes: 40
    env:
-      DOCKER_IMAGE: ${{ matrix.device == 'rocm' && format('pytorch/manylinux2_28-builder:rocm{0}', matrix.rocm_version) || matrix.device == 'aarch64' && 'pytorch/manylinux2_28_aarch64-builder:cpu-aarch64' || matrix.docker-image }}
+      DOCKER_IMAGE: ${{ startsWith(matrix.device, 'rocm') && format('pytorch/manylinux2_28-builder:rocm{0}', matrix.rocm_version) || matrix.device == 'aarch64' && 'pytorch/manylinux2_28_aarch64-builder:cpu-aarch64' || matrix.docker-image }}
      PY_VERS: ${{ matrix.py_vers }}
      BUILD_DEVICE: ${{ matrix.device }}
      PLATFORM: 'manylinux_2_28_x86_64'
@ -153,7 +156,7 @@ jobs:
              docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}"  -m pip install -U cmake --force-reinstall
          fi

-          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "rocm" || "${{ matrix.device }}" == "aarch64" ) ]]; then
+          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == rocm* || "${{ matrix.device }}" == "aarch64" ) ]]; then
            # With this install, it gets clang 16.0.6.
            docker exec -t "${container_name}" dnf install clang lld -y
            WITH_CLANG_LDD="--with-clang-ldd"
@ -171,7 +174,7 @@ jobs:

      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
        with:
-          name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}-${{ env.PLATFORM }}
+          name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}${{ matrix.rocm_version != '' && format('-{0}', matrix.rocm_version) || '' }}-${{ env.PLATFORM }}
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/wheelhouse/*

--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -119,22 +119,6 @@ jobs:
        with:
          docker-image: ${{ steps.build-docker-image.outputs.docker-image }}

-      - name: Generate output
-        if: contains(matrix.docker-image-name, 'rocm')
-        id: generate_output
-        run: |
-          docker_image_name="${{ matrix.docker-image-name }}"
-          docker_image_tag="${{ steps.build-docker-image.outputs.docker-image }}"
-          echo "${docker_image_name}=${docker_image_tag}" >> docker-builds-output-${docker_image_name}.txt
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4.4.0
-        if: contains(matrix.docker-image-name, 'rocm')
-        with:
-          name: docker-builds-artifacts-${{ matrix.docker-image-name }}
-          retention-days: 14
-          path: ./docker-builds-output-${{ matrix.docker-image-name }}.txt
-
      - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
        name: Push to https://ghcr.io/
        id: push-to-ghcr-io
--- a/.github/workflows/docker-cache-mi300.yml
+++ b/.github/workflows/docker-cache-mi300.yml
@ -0,0 +1,55 @@
+name: docker-cache-mi300
+
+on:
+  # run every 6 hours
+  schedule:
+    - cron: 0 0,6,12,18 * * *
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  docker-cache:
+    if: github.repository_owner == 'pytorch'
+    runs-on: rocm-docker
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          no-sudo: true
+
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: false
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+          push: false
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Tar and upload to S3 bucket
+        run: |
+          sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
+          sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress
--- a/.github/workflows/docker-cache-rocm.yml
+++ b/.github/workflows/docker-cache-rocm.yml
@ -1,105 +0,0 @@
-name: docker-cache-rocm
-
-on:
-  workflow_run:
-    workflows: [docker-builds]
-    branches: [main, release]
-    types:
-      - completed
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-  actions: read
-
-jobs:
-  download-docker-builds-artifacts:
-    if: github.repository_owner == 'pytorch'
-    name: download-docker-builds-artifacts
-    runs-on: ubuntu-latest
-    outputs:
-      pytorch-linux-jammy-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}
-      pytorch-linux-noble-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}
-      pytorch-linux-jammy-rocm-n-py3-benchmarks: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}
-    steps:
-      - name: Download artifacts
-        uses: actions/download-artifact@v4.1.7
-        with:
-          run-id: ${{ github.event.workflow_run.id }}
-          path: ./docker-builds-artifacts
-          merge-multiple: true
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Process artifacts
-        id: process-artifacts
-        run: |
-          ls -R ./docker-builds-artifacts
-          cat ./docker-builds-artifacts/*txt >> "${GITHUB_OUTPUT}"
-          cat "${GITHUB_OUTPUT}"
-
-  docker-cache:
-    if: github.repository_owner == 'pytorch'
-    needs: download-docker-builds-artifacts
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: [linux.rocm.gfx942.docker-cache]
-        docker-image: [
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
-        ]
-    runs-on: "${{ matrix.runner }}"
-    steps:
-      - name: debug
-        run: |
-          JSON_STRINGIFIED="${{ toJSON(needs.download-docker-builds-artifacts.outputs) }}"
-          echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}"
-
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        continue-on-error: false
-        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
-
-      - name: Generate ghrc.io tag
-        id: ghcr-io-tag
-        run: |
-            ecr_image="${{ matrix.docker-image }}"
-            ghcr_image="ghcr.io/pytorch/ci-image:${ecr_image##*:}"
-            echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT"
-
-      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }}
-
-      - name: Save as tarball
-        run: |
-          docker_image_tag=${{ matrix.docker-image }}
-          docker_image_tag="${docker_image_tag#*:}" # Remove everything before and including first ":"
-          docker_image_tag="${docker_image_tag%-*}" # Remove everything after and including last "-"
-          ref_name=${{ github.event.workflow_run.head_branch }}
-          if [[ $ref_name =~ "release/" ]]; then
-            ref_suffix="release"
-          elif [[ $ref_name == "main" ]]; then
-            ref_suffix="main"
-          else
-            echo "Unexpected branch in ref_name: ${ref_name}" && exit 1
-          fi
-          docker tag ${{ steps.ghcr-io-tag.outputs.ghcr_image }} ${{ matrix.docker-image }}
-          # mv is atomic operation, so we use intermediate tar.tmp file to prevent read-write contention
-          docker save -o ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ${{ matrix.docker-image }}
-          mv ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ~/pytorch-data/docker/${docker_image_tag}_${ref_suffix}.tar
--- a/.github/workflows/dynamo-unittest.yml
+++ b/.github/workflows/dynamo-unittest.yml
@ -1,70 +0,0 @@
-# Workflow: Dynamo Unit Test
-# runs unit tests for dynamo.
-name: dynamo-unittest
-
-on:
-  push:
-    tags:
-      - ciflow/dynamo/*
-  workflow_call:
-  schedule:
-    - cron: 29 8 * * * # about 1:29am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-      opt_out_experiments: lf
-
-  dynamo-build:
-    name: dynamo-build
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    strategy:
-      matrix:
-        python-version: ['3.11', '3.12']
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
-      docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
-      test-matrix: |
-        { include: [
-          { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-        ]}
-    secrets: inherit
-
-  dynamo-test:
-    name: dynamo-test
-    uses: ./.github/workflows/_linux-test.yml
-    needs: [get-label-type, dynamo-build]
-    strategy:
-      matrix:
-        python-version: ['3.11', '3.12']
-    with:
-      build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
-      docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
-      test-matrix: |
-        { include: [
-          { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-        ]}
-    secrets: inherit
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@ -1,4 +1,4 @@
-name: inductor-rocm-mi200
+name: inductor-rocm

 on:
  schedule:
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -70,7 +70,6 @@ jobs:
          { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}
    secrets: inherit

--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@ -1,4 +1,4 @@
-name: rocm-mi200
+name: rocm

 on:
  push:
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@ -5,9 +5,7 @@
 # Flow:
 # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
 # 2. Runs smoke tests on linux.dgx.b200 runner
-# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke_b200() function
-#    - Includes matmul, scaled_matmul, FP8, and FlashAttention CuTe tests
-#    - FlashAttention CuTe DSL is installed as part of test execution
+# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
 #
 # Triggered by:
 # - Pull requests modifying this workflow file
@ -54,7 +52,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
@ -75,4 +72,4 @@ jobs:
      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-    secrets: inherit
+    secrets: inherit
--- a/.github/workflows/trunk-rocm-mi300.yml
+++ b/.github/workflows/trunk-rocm-mi300.yml
@ -1,83 +0,0 @@
-name: trunk-rocm-mi300
-
-on:
-  push:
-    branches:
-      - main
-      - release/*
-  workflow_dispatch:
-  schedule:
-    - cron: 29 8 * * *  # about 1:29am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  llm-td:
-    if: github.repository_owner == 'pytorch'
-    name: before-test
-    uses: ./.github/workflows/llm_td_retrieval.yml
-    permissions:
-      id-token: write
-      contents: read
-
-  target-determination:
-    name: before-test
-    uses: ./.github/workflows/target_determination.yml
-    needs: llm-td
-    permissions:
-      id-token: write
-      contents: read
-
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-      sync-tag: rocm-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-rocm-py3_10-test:
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -83,7 +83,6 @@ jobs:
          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
-          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
        ]}
    secrets: inherit

--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -5,7 +5,6 @@ on:
    workflows:
      - pull
      - trunk
-      - trunk-rocm-mi300
      - periodic
      - periodic-rocm-mi200
      - periodic-rocm-mi300
--- a/.spin/cmds.py
+++ b/.spin/cmds.py
@ -1,330 +0,0 @@
-import hashlib
-import subprocess
-import sys
-from pathlib import Path
-
-import click
-import spin
-
-
-def file_digest(file, algorithm: str):
-    try:
-        return hashlib.file_digest(file, algorithm)
-    except AttributeError:
-        pass  # Fallback to manual implementation below
-    hash = hashlib.new(algorithm)
-    while chunk := file.read(8192):
-        hash.update(chunk)
-    return hash
-
-
-def _hash_file(file):
-    with open(file, "rb") as f:
-        hash = file_digest(f, "sha256")
-    return hash.hexdigest()
-
-
-def _hash_files(files):
-    hashes = {file: _hash_file(file) for file in files}
-    return hashes
-
-
-def _read_hashes(hash_file: Path):
-    if not hash_file.exists():
-        return {}
-    with hash_file.open("r") as f:
-        lines = f.readlines()
-    hashes = {}
-    for line in lines:
-        hash = line[:64]
-        file = line[66:].strip()
-        hashes[file] = hash
-    return hashes
-
-
-def _updated_hashes(hash_file, files_to_hash):
-    old_hashes = _read_hashes(hash_file)
-    new_hashes = _hash_files(files_to_hash)
-    if new_hashes != old_hashes:
-        return new_hashes
-    return None
-
-
-@click.command()
-def regenerate_version():
-    """Regenerate version.py."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "tools.generate_torch_version",
-        "--is-debug=false",
-    ]
-    spin.util.run(cmd)
-
-
-TYPE_STUBS = [
-    (
-        "Pytorch type stubs",
-        Path(".lintbin/.pytorch-type-stubs.sha256"),
-        [
-            "aten/src/ATen/native/native_functions.yaml",
-            "aten/src/ATen/native/tags.yaml",
-            "tools/autograd/deprecated.yaml",
-        ],
-        [
-            sys.executable,
-            "-m",
-            "tools.pyi.gen_pyi",
-            "--native-functions-path",
-            "aten/src/ATen/native/native_functions.yaml",
-            "--tags-path",
-            "aten/src/ATen/native/tags.yaml",
-            "--deprecated-functions-path",
-            "tools/autograd/deprecated.yaml",
-        ],
-    ),
-    (
-        "Datapipes type stubs",
-        None,
-        [],
-        [
-            sys.executable,
-            "torch/utils/data/datapipes/gen_pyi.py",
-        ],
-    ),
-]
-
-
-@click.command()
-def regenerate_type_stubs():
-    """Regenerate type stubs."""
-    for name, hash_file, files_to_hash, cmd in TYPE_STUBS:
-        if hash_file:
-            if hashes := _updated_hashes(hash_file, files_to_hash):
-                click.echo(
-                    f"Changes detected in type stub files for {name}. Regenerating..."
-                )
-                spin.util.run(cmd)
-                hash_file.parent.mkdir(parents=True, exist_ok=True)
-                with hash_file.open("w") as f:
-                    for file, hash in hashes.items():
-                        f.write(f"{hash}  {file}\n")
-                click.echo("Type stubs and hashes updated.")
-            else:
-                click.echo(f"No changes detected in type stub files for {name}.")
-        else:
-            click.echo(f"No hash file for {name}. Regenerating...")
-            spin.util.run(cmd)
-            click.echo("Type stubs regenerated.")
-
-
-@click.command()
-def regenerate_clangtidy_files():
-    """Regenerate clang-tidy files."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "tools.linter.clang_tidy.generate_build_files",
-    ]
-    spin.util.run(cmd)
-
-
-#: These linters are expected to need less than 3s cpu time total
-VERY_FAST_LINTERS = {
-    "ATEN_CPU_GPU_AGNOSTIC",
-    "BAZEL_LINTER",
-    "C10_NODISCARD",
-    "C10_UNUSED",
-    "CALL_ONCE",
-    "CMAKE_MINIMUM_REQUIRED",
-    "CONTEXT_DECORATOR",
-    "COPYRIGHT",
-    "CUBINCLUDE",
-    "DEPLOY_DETECTION",
-    "ERROR_PRONE_ISINSTANCE",
-    "EXEC",
-    "HEADER_ONLY_LINTER",
-    "IMPORT_LINTER",
-    "INCLUDE",
-    "LINTRUNNER_VERSION",
-    "MERGE_CONFLICTLESS_CSV",
-    "META_NO_CREATE_UNBACKED",
-    "NEWLINE",
-    "NOQA",
-    "NO_WORKFLOWS_ON_FORK",
-    "ONCE_FLAG",
-    "PYBIND11_INCLUDE",
-    "PYBIND11_SPECIALIZATION",
-    "PYPIDEP",
-    "PYPROJECT",
-    "RAWCUDA",
-    "RAWCUDADEVICE",
-    "ROOT_LOGGING",
-    "TABS",
-    "TESTOWNERS",
-    "TYPEIGNORE",
-    "TYPENOSKIP",
-    "WORKFLOWSYNC",
-}
-
-
-#: These linters are expected to take a few seconds, but less than 10s cpu time total
-FAST_LINTERS = {
-    "CMAKE",
-    "DOCSTRING_LINTER",
-    "GHA",
-    "NATIVEFUNCTIONS",
-    "RUFF",
-    "SET_LINTER",
-    "SHELLCHECK",
-    "SPACES",
-}
-
-
-#: These linters are expected to take more than 10s cpu time total;
-#: some need more than 1 hour.
-SLOW_LINTERS = {
-    "ACTIONLINT",
-    "CLANGFORMAT",
-    "CLANGTIDY",
-    "CODESPELL",
-    "FLAKE8",
-    "GB_REGISTRY",
-    "PYFMT",
-    "PYREFLY",
-    "TEST_DEVICE_BIAS",
-    "TEST_HAS_MAIN",
-}
-
-
-ALL_LINTERS = VERY_FAST_LINTERS | FAST_LINTERS | SLOW_LINTERS
-
-
-LINTRUNNER_CACHE_INFO = (
-    Path(".lintbin/.lintrunner.sha256"),
-    [
-        "requirements.txt",
-        "pyproject.toml",
-        ".lintrunner.toml",
-    ],
-)
-
-
-LINTRUNNER_BASE_CMD = [
-    "uvx",
-    "--python",
-    "3.10",
-    "lintrunner@0.12.7",
-]
-
-
-@click.command()
-def setup_lint():
-    """Set up lintrunner with current CI version."""
-    cmd = LINTRUNNER_BASE_CMD + ["init"]
-    subprocess.run(cmd, check=True, capture_output=True, text=True)
-
-
-def _check_linters():
-    cmd = LINTRUNNER_BASE_CMD + ["list"]
-    ret = spin.util.run(cmd, output=False, stderr=subprocess.PIPE)
-    linters = {l.strip() for l in ret.stdout.decode().strip().split("\n")[1:]}
-    unknown_linters = linters - ALL_LINTERS
-    missing_linters = ALL_LINTERS - linters
-    if unknown_linters:
-        click.secho(
-            f"Unknown linters found; please add them to the correct category "
-            f"in .spin/cmds.py: {', '.join(unknown_linters)}",
-            fg="yellow",
-        )
-    if missing_linters:
-        click.secho(
-            f"Missing linters found; please update the corresponding category "
-            f"in .spin/cmds.py: {', '.join(missing_linters)}",
-            fg="yellow",
-        )
-    return unknown_linters, missing_linters
-
-
-@spin.util.extend_command(
-    setup_lint,
-    doc=f"""
-        If configuration has changed, update lintrunner.
-
-        Compares the stored old hashes of configuration files with new ones and
-        performs setup via setup-lint if the hashes have changed.
-        Hashes are stored in {LINTRUNNER_CACHE_INFO[0]}; the following files are
-        considered: {", ".join(LINTRUNNER_CACHE_INFO[1])}.
-        """,
-)
-@click.pass_context
-def lazy_setup_lint(ctx, parent_callback, **kwargs):
-    if hashes := _updated_hashes(*LINTRUNNER_CACHE_INFO):
-        click.echo(
-            "Changes detected in lint configuration files. Setting up linting tools..."
-        )
-        parent_callback(**kwargs)
-        hash_file = LINTRUNNER_CACHE_INFO[0]
-        hash_file.parent.mkdir(parents=True, exist_ok=True)
-        with hash_file.open("w") as f:
-            for file, hash in hashes.items():
-                f.write(f"{hash}  {file}\n")
-        click.echo("Linting tools set up and hashes updated.")
-    else:
-        click.echo("No changes detected in lint configuration files. Skipping setup.")
-    click.echo("Regenerating version...")
-    ctx.invoke(regenerate_version)
-    click.echo("Regenerating type stubs...")
-    ctx.invoke(regenerate_type_stubs)
-    click.echo("Done.")
-    _check_linters()
-
-
-@click.command()
-@click.option("-a", "--apply-patches", is_flag=True)
-@click.pass_context
-def lint(ctx, apply_patches, **kwargs):
-    """Lint all files."""
-    ctx.invoke(lazy_setup_lint)
-    all_files_linters = VERY_FAST_LINTERS | FAST_LINTERS
-    changed_files_linters = SLOW_LINTERS
-    cmd = LINTRUNNER_BASE_CMD
-    if apply_patches:
-        cmd += ["--apply-patches"]
-    all_files_cmd = cmd + [
-        "--take",
-        ",".join(all_files_linters),
-        "--all-files",
-    ]
-    spin.util.run(all_files_cmd)
-    changed_files_cmd = cmd + [
-        "--take",
-        ",".join(changed_files_linters),
-    ]
-    spin.util.run(changed_files_cmd)
-
-
-@click.command()
-@click.pass_context
-def fixlint(ctx, **kwargs):
-    """Autofix all files."""
-    ctx.invoke(lint, apply_patches=True)
-
-
-@click.command()
-@click.option("-a", "--apply-patches", is_flag=True)
-@click.pass_context
-def quicklint(ctx, apply_patches, **kwargs):
-    """Lint changed files."""
-    ctx.invoke(lazy_setup_lint)
-    cmd = LINTRUNNER_BASE_CMD
-    if apply_patches:
-        cmd += ["--apply-patches"]
-    spin.util.run(cmd)
-
-
-@click.command()
-@click.pass_context
-def quickfix(ctx, **kwargs):
-    """Autofix changed files."""
-    ctx.invoke(quicklint, apply_patches=True)
--- a/SECURITY.md
+++ b/SECURITY.md
@ -18,8 +18,6 @@ Please report security issues using https://github.com/pytorch/pytorch/security/

 All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.

-**Note on crashes and out of bounds access**: PyTorch is a computational framework that performs operations on behalf of the caller. Like many low-level libraries, PyTorch generally does not validate all inputs to every function—the responsibility for providing valid arguments lies with the calling code. While crashes and out of bounds memory access should be reported as bugs, they are generally not considered security vulnerabilities in PyTorch's threat model.
-
 Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:

 https://www.facebook.com/whitehat
--- a/aten/src/ATen/LegacyBatchedTensorImpl.h
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.h
@ -144,7 +144,7 @@ inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
 }

 inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
-  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')';
+  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
  return out;
 }

--- a/aten/src/ATen/TensorIndexing.cpp
+++ b/aten/src/ATen/TensorIndexing.cpp
@ -9,7 +9,7 @@ namespace indexing {
 const EllipsisIndexType Ellipsis = EllipsisIndexType();

 std::ostream& operator<<(std::ostream& stream, const Slice& slice) {
-  stream << slice.start() << ':' << slice.stop() << ':' << slice.step();
+  stream << slice.start() << ":" << slice.stop() << ":" << slice.step();
  return stream;
 }

@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
 }

 std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
-  stream << '(';
+  stream << "(";
  for (const auto i : c10::irange(tensor_indices.size())) {
    stream << tensor_indices[i];
    if (i < tensor_indices.size() - 1) stream << ", ";
  }
-  stream << ')';
+  stream << ")";
  return stream;
 }

--- a/aten/src/ATen/TensorNames.cpp
+++ b/aten/src/ATen/TensorNames.cpp
@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const {
 std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) {
  out << tensorname.name_ << " (index ";
  out << tensorname.origin_idx_ << " of ";
-  out << tensorname.origin_ << ')';
+  out << tensorname.origin_ << ")";
  return out;
 }

--- a/aten/src/ATen/TensorUtils.cpp
+++ b/aten/src/ATen/TensorUtils.cpp
@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) {
  if (t.pos == 0) {
    // 0 is distinguished; it usually indicates 'self' or the return
    // tensor
-    out << '\'' << t.name << '\'';
+    out << "'" << t.name << "'";
  } else {
-    out << "argument #" << t.pos << " '" << t.name << '\'';
+    out << "argument #" << t.pos << " '" << t.name << "'";
  }
  return out;
 }
@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) {
      oss << "Tensor for " << t2 << " is on CPU, ";
    }
    oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it")
-        << " to be on GPU (while checking arguments for " << c << ')';
+        << " to be on GPU (while checking arguments for " << c << ")";
    TORCH_CHECK(false, oss.str());
  }
  TORCH_CHECK(
@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t,
        i++;
      }
      oss << "; but got " << t->toString()
-          << " instead (while checking arguments for " << c << ')';
+          << " instead (while checking arguments for " << c << ")";
      TORCH_CHECK(false, oss.str());
    }
 }
--- a/aten/src/ATen/Version.cpp
+++ b/aten/src/ATen/Version.cpp
@ -43,8 +43,8 @@ std::string get_mkldnn_version() {
    // https://github.com/intel/ideep/issues/29
    {
      const dnnl_version_t* ver = dnnl_version();
-      ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch
-         << " (Git Hash " << ver->hash << ')';
+      ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch
+         << " (Git Hash " << ver->hash << ")";
    }
  #else
    ss << "MKLDNN not found";
@ -81,7 +81,7 @@ std::string get_openmp_version() {
          break;
      }
      if (ver_str) {
-        ss << " (a.k.a. OpenMP " << ver_str << ')';
+        ss << " (a.k.a. OpenMP " << ver_str << ")";
      }
    }
  #else
@ -135,38 +135,38 @@ std::string show_config() {

 #if defined(__GNUC__)
  {
-    ss << "  - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n';
+    ss << "  - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n";
  }
 #endif

 #if defined(__cplusplus)
  {
-    ss << "  - C++ Version: " << __cplusplus << '\n';
+    ss << "  - C++ Version: " << __cplusplus << "\n";
  }
 #endif

 #if defined(__clang_major__)
  {
-    ss << "  - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n';
+    ss << "  - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n";
  }
 #endif

 #if defined(_MSC_VER)
  {
-    ss << "  - MSVC " << _MSC_FULL_VER << '\n';
+    ss << "  - MSVC " << _MSC_FULL_VER << "\n";
  }
 #endif

 #if AT_MKL_ENABLED()
-  ss << "  - " << get_mkl_version() << '\n';
+  ss << "  - " << get_mkl_version() << "\n";
 #endif

 #if AT_MKLDNN_ENABLED()
-  ss << "  - " << get_mkldnn_version() << '\n';
+  ss << "  - " << get_mkldnn_version() << "\n";
 #endif

 #ifdef _OPENMP
-  ss << "  - " << get_openmp_version() << '\n';
+  ss << "  - " << get_openmp_version() << "\n";
 #endif

 #if AT_BUILD_WITH_LAPACK()
@ -183,7 +183,7 @@ std::string show_config() {
  ss << "  - Cross compiling on MacOSX\n";
 #endif

-  ss << "  - "<< used_cpu_capability() << '\n';
+  ss << "  - "<< used_cpu_capability() << "\n";

  if (hasCUDA()) {
    ss << detail::getCUDAHooks().showConfig();
@ -200,10 +200,10 @@ std::string show_config() {
  ss << "  - Build settings: ";
  for (const auto& pair : caffe2::GetBuildOptions()) {
    if (!pair.second.empty()) {
-      ss << pair.first << '=' << pair.second << ", ";
+      ss << pair.first << "=" << pair.second << ", ";
    }
  }
-  ss << '\n';
+  ss << "\n";

  // TODO: do HIP
  // TODO: do XLA
--- a/aten/src/ATen/code_template.h
+++ b/aten/src/ATen/code_template.h
@ -209,7 +209,7 @@ struct CodeTemplate {
  // to indent correctly in the context.
  void emitIndent(std::ostream& out, size_t indent) const {
    for ([[maybe_unused]] const auto i : c10::irange(indent)) {
-      out << ' ';
+      out << " ";
    }
  }
  void emitStringWithIndents(
--- a/aten/src/ATen/core/Dimname.cpp
+++ b/aten/src/ATen/core/Dimname.cpp
@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) {
  if (dimname.type() == NameType::WILDCARD) {
    out << "None";
  } else {
-    out << '\'' << dimname.symbol().toUnqualString() << '\'';
+    out << "'" << dimname.symbol().toUnqualString() << "'";
  }
  return out;
 }
--- a/aten/src/ATen/core/Range.cpp
+++ b/aten/src/ATen/core/Range.cpp
@ -5,7 +5,7 @@
 namespace at {

 std::ostream& operator<<(std::ostream& out, const Range& range) {
-  out << "Range[" << range.begin << ", " << range.end << ']';
+  out << "Range[" << range.begin << ", " << range.end << "]";
  return out;
 }

--- a/aten/src/ATen/core/Tensor.cpp
+++ b/aten/src/ATen/core/Tensor.cpp
@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() {

 void TensorBase::print() const {
  if (defined()) {
-    std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n';
+    std::cerr << "[" << toString() << " " << sizes() << "]" << '\n';
  } else {
    std::cerr << "[UndefinedTensor]" << '\n';
  }
--- a/aten/src/ATen/core/TensorAccessor.h
+++ b/aten/src/ATen/core/TensorAccessor.h
@ -1,6 +1,5 @@
 #pragma once

-#include <torch/headeronly/core/TensorAccessor.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Deprecated.h>
@ -12,37 +11,252 @@

 namespace at {

-using torch::headeronly::DefaultPtrTraits;
+// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
+// is used to enable the __restrict__ keyword/modifier for the data
+// passed to cuda.
+template <typename T>
+struct DefaultPtrTraits {
+  typedef T* PtrType;
+};
+
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  using torch::headeronly::RestrictPtrTraits;
+template <typename T>
+struct RestrictPtrTraits {
+  typedef T* __restrict__ PtrType;
+};
 #endif

+// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
+// For CUDA tensors it is used in device code (only). This means that we restrict ourselves
+// to functions and types available there (e.g. IntArrayRef isn't).
+
+// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
 template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase<c10::IntArrayRef, T, N, PtrTraits, index_t>;
+class TensorAccessorBase {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;

+  C10_HOST_DEVICE TensorAccessorBase(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : data_(data_), sizes_(sizes_), strides_(strides_) {}
+  C10_HOST IntArrayRef sizes() const {
+    return IntArrayRef(sizes_,N);
+  }
+  C10_HOST IntArrayRef strides() const {
+    return IntArrayRef(strides_,N);
+  }
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  const index_t* sizes_;
+  const index_t* strides_;
+};
+
+// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
+// `Tensor.accessor<T, N>()`.
+// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
+// indexing on the device uses `TensorAccessor`s.
 template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using TensorAccessor = torch::headeronly::detail::TensorAccessor<c10::IntArrayRef, T, N, PtrTraits, index_t>;
+class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;

-namespace detail {
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : TensorAccessorBase<T, N, PtrTraits, index_t>(data_,sizes_,strides_) {}

-template <size_t N, typename index_t>
-struct IndexBoundsCheck {
-    IndexBoundsCheck(index_t i) {
-      TORCH_CHECK_INDEX(
+  C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
+  }
+
+  C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
+  }
+};
+
+template<typename T, template <typename U> class PtrTraits, typename index_t>
+class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : TensorAccessorBase<T, 1, PtrTraits, index_t>(data_,sizes_,strides_) {}
+  C10_HOST_DEVICE T & operator[](index_t i) {
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    return this->data_[this->strides_[0]*i];
+  }
+  C10_HOST_DEVICE const T & operator[](index_t i) const {
+    return this->data_[this->strides_[0]*i];
+  }
+};
+
+
+// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on for CUDA `Tensor`s on the host
+// and as
+// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
+// in order to transfer them on the device when calling kernels.
+// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s.
+// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
+// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
+// on the device, so those functions are host only.
+template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class GenericPackedTensorAccessorBase {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : data_(data_) {
+    std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
+    std::copy(strides_, strides_ + N, std::begin(this->strides_));
+  }
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : data_(data_) {
+    for (const auto i : c10::irange(N)) {
+      this->sizes_[i] = sizes_[i];
+      this->strides_[i] = strides_[i];
+    }
+  }
+
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t sizes_[N];
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t strides_[N];
+  C10_HOST void bounds_check_(index_t i) const {
+    TORCH_CHECK_INDEX(
        0 <= i && i < index_t{N},
        "Index ",
        i,
        " is not within bounds of a tensor of dimension ",
        N);
-    }
+  }
 };
-}  // namespace detail

 template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase<detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
+class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
+    index_t* new_sizes = this->sizes_ + 1;
+    index_t* new_strides = this->strides_ + 1;
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
+  }
+
+  C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
+    const index_t* new_sizes = this->sizes_ + 1;
+    const index_t* new_strides = this->strides_ + 1;
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
+  }
+
+  /// Returns a PackedTensorAccessor of the same dimension after transposing the
+  /// two dimensions given. Does not actually move elements; transposition is
+  /// made by permuting the size/stride arrays. If the dimensions are not valid,
+  /// asserts.
+  C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
+        this->data_, this->sizes_, this->strides_);
+    std::swap(result.strides_[dim1], result.strides_[dim2]);
+    std::swap(result.sizes_[dim1], result.sizes_[dim2]);
+    return result;
+  }
+};
+
+template<typename T, template <typename U> class PtrTraits, typename index_t>
+class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  C10_DEVICE T & operator[](index_t i) {
+    return this->data_[this->strides_[0] * i];
+  }
+  C10_DEVICE const T& operator[](index_t i) const {
+    return this->data_[this->strides_[0]*i];
+  }
+
+  // Same as in the general N-dimensional case, but note that in the
+  // 1-dimensional case the returned PackedTensorAccessor will always be an
+  // identical copy of the original
+  C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
+        this->data_, this->sizes_, this->strides_);
+  }
+};

-template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor<TensorAccessor<T, N-1, PtrTraits, index_t>, detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;

 // Can't put this directly into the macro function args because of commas
 #define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -245,9 +245,6 @@ class TORCH_API TensorBase {
  size_t weak_use_count() const noexcept {
    return impl_.weak_use_count();
  }
-  bool is_uniquely_owned() const noexcept {
-    return impl_.is_uniquely_owned();
-  }

  std::string toString() const;

--- a/aten/src/ATen/core/Vitals.cpp
+++ b/aten/src/ATen/core/Vitals.cpp
@ -9,8 +9,8 @@ APIVitals VitalsAPI;

 std::ostream& operator<<(std::ostream& os, TorchVital const& tv) {
  for (const auto& m : tv.attrs) {
-    os << "[TORCH_VITAL] " << tv.name << '.' << m.first << "\t\t "
-       << m.second.value << '\n';
+    os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t "
+       << m.second.value << "\n";
  }
  return os;
 }
--- a/aten/src/ATen/core/alias_info.h
+++ b/aten/src/ATen/core/alias_info.h
@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {

 // this does match the way things are represented in the schema
 inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
-  out << '(';
+  out << "(";
  bool first = true;
  for (const auto& set : aliasInfo.beforeSets()) {
    if (first) {
      first = false;
    } else {
-      out << '|';
+      out << "|";
    }
    out << set.toUnqualString();
  }
  if (aliasInfo.isWrite()) {
-    out << '!';
+    out << "!";
  }
  if (aliasInfo.beforeSets() != aliasInfo.afterSets()) {
    out << " -> ";
@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
      if (first) {
        first = false;
      } else {
-        out << '|';
+        out << "|";
      }
      out << set.toUnqualString();
    }
  }
-  out << ')';
+  out << ")";
  return out;
 }
 } // namespace c10
--- a/aten/src/ATen/core/blob.h
+++ b/aten/src/ATen/core/blob.h
@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs)  noexcept {
 }

 inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
-  return out << "Blob[" << v.TypeName() << ']';
+  return out << "Blob[" << v.TypeName() << "]";
 }

 } // namespace caffe2
--- a/aten/src/ATen/core/class_type.cpp
+++ b/aten/src/ATen/core/class_type.cpp
@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
          *why_not << "Method on class '" << repr_str()
                   << "' (1) is not compatible with interface '"
                   << rhs.repr_str() << "' (2)\n"
-                   << "  (1) " << self_method->getSchema() << '\n'
-                   << "  (2) " << schema << '\n';
+                   << "  (1) " << self_method->getSchema() << "\n"
+                   << "  (2) " << schema << "\n";
        }
        return false;
      }
--- a/aten/src/ATen/core/class_type.h
+++ b/aten/src/ATen/core/class_type.h
@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType {
  std::string repr_str() const override {
    std::stringstream ss;
    ss << str()
-       << " (of Python compilation unit at: " << compilation_unit().get() << ')';
+       << " (of Python compilation unit at: " << compilation_unit().get() << ")";
    return ss.str();
  }

--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const {
  std::ostringstream oss;
  for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) {
    if (dispatch_arg_indices_reverse_.get(i)) {
-      oss << '1';
+      oss << "1";
    } else {
-      oss << '0';
+      oss << "0";
    }
  }
-  oss << ' ' << nonFallthroughKeys_ << '\n';
+  oss << " " << nonFallthroughKeys_ << "\n";
  return oss.str();
 }

--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@ -69,8 +69,8 @@ private:

 void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
  auto nesting_value = dispatch_trace_nesting_value();
-  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' ';
-  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl;
+  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
+  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
 }
 } // namespace detail

--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const {

 std::string OperatorEntry::listAllDispatchKeys() const {
  std::ostringstream str;
-  str << '[';
+  str << "[";

  bool has_kernels = false;
  for (auto k : allDispatchKeysInFullSet()) {
@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
    str << k;
    has_kernels = true;
  }
-  str << ']';
+  str << "]";
  return str.str();
 }

@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> c
 // This WON'T report backend fallbacks.
 std::string OperatorEntry::dumpState() const {
  std::ostringstream oss;
-  oss << "name: " << name_ << '\n';
+  oss << "name: " << name_ << "\n";
  if (schema_) {
-    oss << "schema: " << schema_->schema << '\n';
-    oss << "debug: " << schema_->debug << '\n';
+    oss << "schema: " << schema_->schema << "\n";
+    oss << "debug: " << schema_->debug << "\n";
    oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis())
-        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n';
+        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n";
  } else {
    oss << "schema: (none)\n";
  }
--- a/aten/src/ATen/core/function_schema.cpp
+++ b/aten/src/ATen/core/function_schema.cpp
@ -7,7 +7,7 @@
 namespace c10 {

 void FunctionSchema::dump() const {
-  std::cout << *this << '\n';
+  std::cout << *this << "\n";
 }

 const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type) const {
@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {

  out << schema.name();
  if (!schema.overload_name().empty()) {
-    out << '.' << schema.overload_name();
+    out << "." << schema.overload_name();
  }
-  out << '(';
+  out << "(";

  bool seen_kwarg_only = false;
  for (const auto i : c10::irange(schema.arguments().size())) {
@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
  }

  if (need_paren) {
-    out << '(';
+    out << "(";
  }
  for (const auto i : c10::irange(returns.size())) {
    if (i > 0) {
@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
    out << "...";
  }
  if (need_paren) {
-    out << ')';
+    out << ")";
  }
  return out;
 }
@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith(
    if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) {
      if (why_not) {
        why_not
-            << '\'' << arguments().at(i).name() << '\''
+            << "'" << arguments().at(i).name() << "'"
            << " is not forward compatible with the older version of the schema";
      }
      return false;
@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith(
             .isForwardCompatibleWith(old.arguments().at(i))) {
      if (why_not) {
        why_not << "Out argument '"
-                << '\'' << arguments().at(i).name()
+                << "'" << arguments().at(i).name()
                << " is not FC with the older version of the schema";
      }
      return false;
--- a/aten/src/ATen/core/function_schema.h
+++ b/aten/src/ATen/core/function_schema.h
@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
    if (arg.N()) {
        N = std::to_string(*arg.N());
    }
-    out << '[' << N << ']';
+    out << "[" << N << "]";
  } else {
    out << unopt_type->str();
  }
@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
  }

  if (is_opt) {
-    out << '?';
+    out << "?";
  }

  if (!arg.name().empty()) {
-    out << ' ' << arg.name();
+    out << " " << arg.name();
  }

  if (arg.default_value()) {
-    out << '=';
+    out << "=";
    if ((type->kind() == c10::TypeKind::StringType ||
        unopt_type->kind() == c10::TypeKind::StringType) &&
        arg.default_value().value().isString()) {
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
 }

 std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
-  out << v.qualifiedClassName() << '.' << v.name();
+  out << v.qualifiedClassName() << "." << v.name();
  return out;
 }

@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList(
      !elementTypeCanBeInferredFromMembers(list_elem_type)) {
    out << "annotate(" << the_list.type<c10::Type>()->annotation_str() << ", ";
    printList(out, the_list.toListRef(), "[", "]", formatter);
-    out << ')';
+    out << ")";
    return out;
  } else {
    return printList(out, the_list.toListRef(), "[", "]", formatter);
@ -538,7 +538,7 @@ std::ostream& printDict(
    std::ostream& out,
    const Dict& v,
    const IValueFormatter& formatter) {
-  out << '{';
+  out << "{";

  bool first = true;
  for (const auto& pair : v) {
@ -552,7 +552,7 @@ std::ostream& printDict(
    first = false;
  }

-  out << '}';
+  out << "}";
  return out;
 }
 }
@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict(
  auto value_type = the_dict.type()->castRaw<DictType>()->getValueType();
  if (the_dict.toGenericDict().empty() ||
      !elementTypeCanBeInferredFromMembers(value_type)) {
-    out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ',';
-    printDict(out, the_dict.toGenericDict(), formatter) << ')';
+    out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ",";
+    printDict(out, the_dict.toGenericDict(), formatter) << ")";
  } else {
    return printDict(out, the_dict.toGenericDict(), formatter);
  }
@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) {
  c10::complex<double> d = v.toComplexDouble();
  IValue real(d.real()), imag(std::abs(d.imag()));
  auto sign = d.imag() >= 0 ? '+' : '-';
-  return out << real << sign << imag << 'j';
+  return out << real << sign << imag << "j";
 }

 std::ostream& IValue::repr(
@ -605,9 +605,9 @@ std::ostream& IValue::repr(
        if (static_cast<double>(i) == d) {
          // -0.0 (signed zero) needs to be parsed as -0.
          if (i == 0 && std::signbit(d)) {
-            return out << '-' << i << '.';
+            return out << "-" << i << ".";
          }
-          return out << i << '.';
+          return out << i << ".";
        }
      }
      auto orig_prec = out.precision();
@ -643,20 +643,20 @@ std::ostream& IValue::repr(
      device_stream << v.toDevice();
      out << "torch.device(";
      c10::printQuotedString(out, device_stream.str());
-      return out << ')';
+      return out << ")";
    }
    case IValue::Tag::Generator: {
      auto generator = v.toGenerator();
      out << "torch.Generator(device=";
      c10::printQuotedString(out, generator.device().str());
-      out << ", seed=" << generator.current_seed() << ')';
+      out << ", seed=" << generator.current_seed() << ")";
      return out;
    }
    case IValue::Tag::GenericDict:
      return printMaybeAnnotatedDict(out, v, formatter);
    case IValue::Tag::Enum: {
      auto enum_holder = v.toEnumHolder();
-      return out << enum_holder->qualifiedClassName() << '.' <<
+      return out << enum_holder->qualifiedClassName() << "." <<
          enum_holder->name();
    }
    case IValue::Tag::Object: {
@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      if (c == FP_NORMAL || c == FP_ZERO) {
        int64_t i = static_cast<int64_t>(d);
        if (static_cast<double>(i) == d) {
-          return out << i << '.';
+          return out << i << ".";
        }
      }
      auto orig_prec = out.precision();
@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      return printDict(out, v.toGenericDict(), formatter);
    case IValue::Tag::PyObject: {
      auto py_obj = v.toPyObject();
-      return out << "<PyObject at" << py_obj << '>';
+      return out << "<PyObject at" << py_obj << ">";
    }
    case IValue::Tag::Generator:
      return out << "Generator";
@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      // TODO we should attempt to call __str__ if the object defines it.
      auto obj = v.toObject();
      // print this out the way python would do it
-      return out << '<' << obj->name() << " object at " << obj.get() << '>';
+      return out << "<" << obj->name() << " object at " << obj.get() << ">";
    }
    case IValue::Tag::Enum: {
      auto enum_holder = v.toEnumHolder();
-      return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' <<
-          enum_holder->name() << '>';
+      return out << "Enum<" << enum_holder->unqualifiedClassName() << "." <<
+          enum_holder->name() << ">";
    }

  }
-  return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << '>';
+  return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << ">";
 }

 #undef TORCH_FORALL_TAGS

 void IValue::dump() const {
-  std::cout << *this << '\n';
+  std::cout << *this << "\n";
 }

 std::shared_ptr<ClassType> ivalue::Object::type() const {
@ -1050,7 +1050,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
      std::stringstream err;
      err << "Cannot serialize custom bound C++ class";
      if (auto qualname = type()->name()) {
-        err << ' ' << qualname->qualifiedName();
+        err << " " << qualname->qualifiedName();
      }
      err << ". Please define serialization methods via def_pickle() for "
            "this class.";
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@ -18,8 +18,6 @@
 #include <unordered_set>
 #include <utility>

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 namespace torch {
 class TORCH_API CustomClassHolder : public c10::intrusive_ptr_target {};
 namespace jit {
@ -1632,6 +1630,4 @@ struct TORCH_API WeakOrStrongTypePtr {

 } // namespace c10

-C10_DIAGNOSTIC_POP()
-
 #include <ATen/core/ivalue_inl.h> // IWYU pragma: keep
--- a/aten/src/ATen/core/ivalue_inl.h
+++ b/aten/src/ATen/core/ivalue_inl.h
@ -29,8 +29,6 @@
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/irange.h>

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 namespace torch {
 namespace jit {
 struct Function;
@ -2569,5 +2567,3 @@ TypePtr IValue::type() const {
 }

 } // namespace c10
-
-C10_DIAGNOSTIC_POP()
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType {

  std::string str() const override {
    std::stringstream ss;
-    ss << getElementType()->str() << '?';
+    ss << getElementType()->str() << "?";
    return ss.str();
  }

@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType {

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Optional[" << getElementType()->annotation_str(printer) << ']';
+    ss << "Optional[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -906,7 +906,7 @@ struct TORCH_API ListType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "List[" << getElementType()->annotation_str(printer) << ']';
+    ss << "List[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType {
  std::string str() const override {
    std::stringstream ss;
    ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str()
-       << ')';
+       << ")";
    return ss.str();
  }

@ -1018,7 +1018,7 @@ struct TORCH_API FutureType

  std::string str() const override {
    std::stringstream ss;
-    ss << "Future(" << getElementType()->str() << ')';
+    ss << "Future(" << getElementType()->str() << ")";
    return ss.str();
  }
  TypePtr createWithContained(
@ -1041,7 +1041,7 @@ struct TORCH_API FutureType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Future[" << getElementType()->annotation_str(printer) << ']';
+    ss << "Future[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType

  std::string str() const override {
    std::stringstream ss;
-    ss << "Await(" << getElementType()->str() << ')';
+    ss << "Await(" << getElementType()->str() << ")";
    return ss.str();
  }
  TypePtr createWithContained(
@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Await[" << getElementType()->annotation_str(printer) << ']';
+    ss << "Await[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -1102,7 +1102,7 @@ struct TORCH_API RRefType

  std::string str() const override {
    std::stringstream ss;
-    ss << "RRef(" << getElementType()->str() << ')';
+    ss << "RRef(" << getElementType()->str() << ")";
    return ss.str();
  }
  TypePtr createWithContained(
@ -1115,7 +1115,7 @@ struct TORCH_API RRefType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "RRef[" << getElementType()->annotation_str(printer) << ']';
+    ss << "RRef[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
--- a/aten/src/ATen/core/operator_name.cpp
+++ b/aten/src/ATen/core/operator_name.cpp
@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) {
 std::ostream& operator<<(std::ostream& os, const OperatorName& opName) {
  os << opName.name;
  if (!opName.overload_name.empty()) {
-    os << '.' << opName.overload_name;
+    os << "." << opName.overload_name;
  }
  return os;
 }
--- a/aten/src/ATen/core/tensor_type.cpp
+++ b/aten/src/ATen/core/tensor_type.cpp
@ -65,7 +65,7 @@ VaryingShape<T> VaryingShape<T>::merge(const VaryingShape<T>& other) const {

 template <typename T>
 std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
-  out << '(';
+  out << "(";
  if (!vs.size()) {
    out << "*)";
    return out;
@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
    if (v.has_value()) {
      out << v.value();
    } else {
-      out << '*';
+      out << "*";
    }
  }
-  out << ')';
+  out << ")";
  return out;
 }

@ -105,7 +105,7 @@ std::ostream& operator<<(
  }
  auto sizes_opt = ss.sizes();

-  os << '(';
+  os << "(";
  for (size_t i = 0; i < rank_opt.value(); i++) {
    if (i > 0) {
      os << ", ";
@ -113,10 +113,10 @@ std::ostream& operator<<(
    if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) {
      os << sizes_opt.value()[i];
    } else {
-      os << '*';
+      os << "*";
    }
  }
-  os << ')';
+  os << ")";

  return os;
 }
@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) {
 }

 std::ostream& operator<<(std::ostream& os, const Stride& s) {
-  os << '{';
+  os << "{";
  if (s.stride_index_.has_value()) {
    os << *s.stride_index_;
  } else {
-    os << '*';
+    os << "*";
  }
-  os << ':';
+  os << ":";
  if (s.stride_.has_value()) {
    os << *s.stride_;
  } else {
-    os << '*';
+    os << "*";
  }
  os << '}';
  return os;
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
      bool has_valid_strides_info = ndim > 0 &&
          value->strides().isComplete() && value->strides().size() == ndim;

-      out << '(';
+      out << "(";
      size_t i = 0;
      bool symbolic = type_verbosity() == TypeVerbosity::Symbolic;
      for (i = 0; i < *ndim; ++i) {
@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
        } else if (symbolic) {
          out << value->symbolic_sizes().at(i);
        } else {
-          out << '*';
+          out << "*";
        }
      }
      if (has_valid_strides_info &&
@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          }
          out << value->strides()[i].value();
        }
-        out << ']';
+        out << "]";
      }
      if (type_verbosity() >= TypeVerbosity::Full) {
        if (value->requiresGrad()) {
@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          out << "device=" << *value->device();
        }
      }
-      out << ')';
+      out << ")";
    } else {
      if (type_verbosity() >= TypeVerbosity::Full) {
        size_t i = 0;
        if (value->requiresGrad()) {
-          out << '('
+          out << "("
              << "requires_grad=" << *value->requiresGrad();
          i++;
        }
@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device();
        }
        if (i > 0) {
-          out << ')';
+          out << ")";
        }
      }
    }
@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
    out << *prim << "[]";
  } else if (t.kind() == TypeKind::OptionalType) {
    auto prim = t.castRaw<OptionalType>()->getElementType();
-    out << *prim << '?';
+    out << *prim << "?";
  } else if(t.kind() == TypeKind::FutureType) {
    auto elem = t.castRaw<FutureType>()->getElementType();
-    out << "Future[" << *elem << ']';
+    out << "Future[" << *elem << "]";
  } else if(t.kind() == TypeKind::RRefType) {
    auto elem = t.castRaw<RRefType>()->getElementType();
-    out << "RRef[" << *elem << ']';
+    out << "RRef[" << *elem << "]";
  } else if(auto tup = t.cast<TupleType>()) {
    if (tup->schema()) {
      out << "NamedTuple";
    }
-    out << '(';
+    out << "(";
    for(size_t i = 0; i < tup->elements().size(); ++i) {
      if(i > 0)
        out << ", ";
@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
        out << *(tup->elements()[i]);
      }
    }
-    out << ')';
+    out << ")";
  } else if (t.kind() == TypeKind::FunctionType) {
    out << "Function";
  } else {
@ -475,7 +475,7 @@ std::optional<TypePtr> unifyTypeList(
      why_not << "Could not unify type list since element " << i << " of type "
              << elements.at(i)->repr_str()
              << " did not match the types before it ("
-              << ret_type->repr_str() << ')';
+              << ret_type->repr_str() << ")";
      return std::nullopt;
    }
    ret_type = *maybe_unified;
@ -907,13 +907,13 @@ std::string TupleType::str() const {
    // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
    ss << name()->qualifiedName();
  } else {
-    ss << '(';
+    ss << "(";
    for(size_t i = 0; i < elements().size(); ++i) {
      if(i > 0)
        ss << ", ";
      ss << elements()[i]->str();
    }
-    ss << ')';
+    ss << ")";
  }
  return ss.str();
 }
@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl(
          *why_not << "Method on interface '" << lhs.repr_str()
                   << "' (1) is not compatible with interface '"
                   << rhs.repr_str() << "' (2)\n"
-                   << "  (1) " << *self_schema << '\n'
-                   << "  (2) " << schema << '\n';
+                   << "  (1) " << *self_schema << "\n"
+                   << "  (2) " << schema << "\n";
          return false;
        }
        return false;
@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const {
 }

 void SymbolicShape::dump() const {
-  std::cout << *this << '\n';
+  std::cout << *this << "\n";
 }

 bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
--- a/aten/src/ATen/core/union_type.cpp
+++ b/aten/src/ATen/core/union_type.cpp
@ -205,9 +205,9 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
    for (const auto i : c10::irange(reference.size())) {
      msg << reference[i]->repr_str();
      if (i > 0) {
-        msg << ',';
+        msg << ",";
      }
-      msg << ' ';
+      msg << " ";
    }
    msg << "} has the single type " << types_[0]->repr_str()
         << ". Use the common supertype instead of creating a Union"
--- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@ -223,62 +223,6 @@ CONVERT_FROM_BF16_TEMPLATE(double)
 CONVERT_FROM_BF16_TEMPLATE(float16_t)
 #endif

-#ifdef __ARM_FEATURE_BF16
-
-// clang-[17, 20] crashes when autovectorizing static cast to bf16
-// Below is a workaround to have some vectorization
-// Works decently well for smaller int types
-template <typename from_type>
-inline void convertToBf16Impl(
-    const from_type* __restrict src,
-    c10::BFloat16* __restrict dst,
-    uint64_t n) {
-  bfloat16_t* dstPtr = reinterpret_cast<bfloat16_t*>(dst);
-  uint64_t loopBound = n - (n % 16);
-  uint64_t i = 0;
-  for (; i < loopBound; i += 16) {
-    float32x4_t a, b, c, d;
-    a[0] = static_cast<float>(src[i]);
-    a[1] = static_cast<float>(src[i + 1]);
-    a[2] = static_cast<float>(src[i + 2]);
-    a[3] = static_cast<float>(src[i + 3]);
-    b[0] = static_cast<float>(src[i + 4]);
-    b[1] = static_cast<float>(src[i + 5]);
-    b[2] = static_cast<float>(src[i + 6]);
-    b[3] = static_cast<float>(src[i + 7]);
-    c[0] = static_cast<float>(src[i + 8]);
-    c[1] = static_cast<float>(src[i + 9]);
-    c[2] = static_cast<float>(src[i + 10]);
-    c[3] = static_cast<float>(src[i + 11]);
-    d[0] = static_cast<float>(src[i + 12]);
-    d[1] = static_cast<float>(src[i + 13]);
-    d[2] = static_cast<float>(src[i + 14]);
-    d[3] = static_cast<float>(src[i + 15]);
-
-    vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b));
-    vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d));
-  }
-
-#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
-  for (; i < n; i++) {
-    float a = static_cast<float>(src[i]);
-    dstPtr[i] = vcvth_bf16_f32(a);
-  }
-}
-
-#define CONVERT_TO_BF16_TEMPLATE(from_type)                                  \
-  template <>                                                                \
-  inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \
-    return convertToBf16Impl<from_type>(src, dst, n);                        \
-  }
-
-CONVERT_TO_BF16_TEMPLATE(uint8_t)
-CONVERT_TO_BF16_TEMPLATE(int8_t)
-CONVERT_TO_BF16_TEMPLATE(int16_t)
-CONVERT_TO_BF16_TEMPLATE(int32_t)
-
-#endif
-
 inline void convertBoolToBfloat16Impl(
    const bool* __restrict src,
    c10::BFloat16* __restrict dst,
--- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
@ -11,8 +11,6 @@
 #include <sleef.h>
 #endif

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 // Sleef offers vectorized versions of some transcedentals
 // such as sin, cos, tan etc..
 // However for now opting for STL, since we are not building
@ -652,5 +650,3 @@ inline Vectorized<float> Vectorized<float>::erf() const {

 } // namespace CPU_CAPABILITY
 } // namespace at::vec
-
-C10_DIAGNOSTIC_POP()
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
    }
    stream << buf[i];
  }
-  stream << ']';
+  stream << "]";
  return stream;
 }

--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
    }
    stream << buf[i];
  }
-  stream << ']';
+  stream << "]";
  return stream;
 }

--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@ -3,7 +3,6 @@

 #include <cstdint>
 #include <map>
-#include <shared_mutex>

 #include <cuda_runtime_api.h>
 #include <cusparse.h>
@ -89,13 +88,8 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
 TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();

 TORCH_CUDA_CPP_API void clearCublasWorkspaces();
-struct WorkspaceMapWithMutex {
-  std::map<std::tuple<void*, void*>, at::DataPtr> map;
-  std::shared_mutex mutex;
-};
-
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
 TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
 TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
 TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@ -1,7 +1,6 @@
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/CUDAGraph.h>
 #include <ATen/cuda/Exceptions.h>
-#include <ATen/cuda/MemPool.h>
 #include <ATen/Functions.h>
 #include <c10/cuda/CUDAFunctions.h>

@ -14,7 +13,7 @@ static bool _cuda_graphs_debug = false;
 MempoolId_t graph_pool_handle() {
  // Sets just the second value, to distinguish it from MempoolId_ts created from
  // cudaStreamGetCaptureInfo id_s in capture_begin.
-  return at::cuda::MemPool::graph_pool_handle();
+  return c10::cuda::MemPool::graph_pool_handle();
 }

 /**
@ -91,7 +90,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
  } else {
    // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false.
    // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle().
-    mempool_id_ = at::cuda::MemPool::graph_pool_handle(false);
+    mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false);
    TORCH_INTERNAL_ASSERT(mempool_id_.first > 0);
  }

@ -175,24 +174,17 @@ void CUDAGraph::instantiate() {
    // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
    // who prefer not to report error message through these arguments moving forward
    // (they prefer return value, or errors on api calls internal to the capture)
-    // ROCM appears to fail with HIP error: invalid argument
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM)
-    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
+    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
 #else
    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
 #endif
 //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
 //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
  } else {
-#if !defined(USE_ROCM)
-    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
-                                                graph_,
-                                                cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
-#else
    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                graph_,
                                                cudaGraphInstantiateFlagAutoFreeOnLaunch));
-#endif
  }
  has_graph_exec_ = true;
 }
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
 //   - Comments of @soumith copied from cuDNN handle pool implementation
 #ifdef NO_CUDNN_DESTROY_HANDLE
 #else
-  cublasDestroy(handle);
+    cublasDestroy(handle);
 #endif
 }

@ -107,27 +107,19 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle

 } // namespace

-WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

-WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

 void clearCublasWorkspaces() {
-  {
-    auto& workspace = cublas_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
-  {
-    auto& workspace = cublaslt_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
+  cublas_handle_stream_to_workspace().clear();
+  cublaslt_handle_stream_to_workspace().clear();
 }

 size_t parseChosenWorkspaceSize() {
@ -241,38 +233,6 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
  return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
 }

-void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
-  cudaStream_t _stream = stream;
-  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublas_handle_stream_to_workspace();
-
-  size_t workspace_size = getChosenWorkspaceSize();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
-          handle, workspace_it->second.get(), workspace_size));
-      return;
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    TORCH_CUDABLAS_CHECK(
-        cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
-  }
-}
-
 void* getCUDABlasLtWorkspace() {
 #ifndef USE_ROCM
  static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
@ -281,10 +241,8 @@ void* getCUDABlasLtWorkspace() {
    auto stream = c10::cuda::getCurrentCUDAStream();
    cudaStream_t _stream = stream;
    auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-    auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
+    auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
+    TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
    return workspace_it->second.mutable_get();
  }
 #endif
@ -292,29 +250,11 @@ void* getCUDABlasLtWorkspace() {
  auto stream = c10::cuda::getCurrentCUDAStream();
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublaslt_handle_stream_to_workspace();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      return workspace_it->second.mutable_get();
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewCUDABlasLtWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it =
-          workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    return workspace_it->second.mutable_get();
+  auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
+    workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
  }
+  return workspace_it->second.mutable_get();
 }

 cublasHandle_t getCurrentCUDABlasHandle() {
@ -358,8 +298,13 @@ cublasHandle_t getCurrentCUDABlasHandle() {
  // will allocate memory dynamically (even if they're cheap) outside
  // PyTorch's CUDA caching allocator. It's possible that CCA used up
  // all the memory and cublas's cudaMallocAsync will return OOM
-  setWorkspaceForHandle(handle, stream);
-
+  cudaStream_t _stream = stream;
+  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+  auto workspace_it = cublas_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublas_handle_stream_to_workspace().end()) {
+    workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
+  }
+  TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
 #if !defined(USE_ROCM)
  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
  // FP32 data type calculations based on the value of the allow_tf32 flag.
--- a/aten/src/ATen/cuda/MemPool.cpp
+++ b/aten/src/ATen/cuda/MemPool.cpp
@ -1,69 +0,0 @@
-#include <ATen/core/CachingHostAllocator.h>
-#include <ATen/cuda/MemPool.h>
-
-namespace at::cuda {
-
-// uid_ is incremented when a user creates a MemPool,
-// for example: using graph_pool_handle() or c10::cuda::MemPool().
-//
-// uuid_ is incremented when CUDAGraph creates a MemPool
-// as a result of a user not providing a pool.
-//
-// MempoolId_t of {0, 0} is used to denote when no MemPool has been
-// passed to a function, either by user or CUDAGraphs. For example,
-// default value of MempoolId_t for capture_begin function is {0, 0}.
-// That's why uid_ and uuid_ start at 1.
-std::atomic<CaptureId_t> MemPool::uid_{1};
-std::atomic<CaptureId_t> MemPool::uuid_{1};
-
-MemPool::MemPool(
-    CUDACachingAllocator::CUDAAllocator* allocator,
-    bool is_user_created,
-    bool use_on_oom)
-    : allocator_(allocator), is_user_created_(is_user_created) {
-  if (is_user_created_) {
-    id_ = {0, uid_++};
-  } else {
-    id_ = {uuid_++, 0};
-  }
-  device_ = c10::cuda::current_device();
-  CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
-  if (use_on_oom) {
-    CUDACachingAllocator::setUseOnOOM(device_, id_);
-  }
-}
-
-MemPool::~MemPool() {
-  // TORCH_INTERNAL_ASSERT(use_count() == 1);
-  // We used to assert that TORCH_INTERNAL_ASSERT(use_count() == 1);
-  // However, this assertion is not true if a memory pool is shared
-  // with a cuda graph. That CUDAGraph will increase the use count
-  // until it is reset.
-  CUDACachingAllocator::releasePool(device_, id_);
-  c10::cuda::CUDACachingAllocator::emptyCache(id_);
-}
-
-MempoolId_t MemPool::id() {
-  return id_;
-}
-
-CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
-  return allocator_;
-}
-
-int MemPool::use_count() {
-  return CUDACachingAllocator::getPoolUseCount(device_, id_);
-}
-
-c10::DeviceIndex MemPool::device() {
-  return device_;
-}
-
-MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
-  if (is_user_created) {
-    return {0, uid_++};
-  }
-  return {uuid_++, 0};
-}
-
-} // namespace at::cuda
--- a/aten/src/ATen/cuda/MemPool.h
+++ b/aten/src/ATen/cuda/MemPool.h
@ -1,44 +0,0 @@
-#pragma once
-
-#include <c10/core/Allocator.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-
-namespace at::cuda {
-
-// Keep BC only
-using c10::CaptureId_t;
-using c10::MempoolId_t;
-
-// MemPool represents a pool of memory in a caching allocator. Currently,
-// it's just the ID of the pool object maintained in the CUDACachingAllocator.
-//
-// An allocator pointer can be passed to the MemPool to define how the
-// allocations should be done in the pool. For example: using a different
-// system allocator such as ncclMemAlloc.
-struct TORCH_CUDA_CPP_API MemPool {
-  MemPool(
-      c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
-      bool is_user_created = true,
-      bool use_on_oom = false);
-  MemPool(const MemPool&) = delete;
-  MemPool(MemPool&&) = default;
-  MemPool& operator=(const MemPool&) = delete;
-  MemPool& operator=(MemPool&&) = default;
-  ~MemPool();
-
-  MempoolId_t id();
-  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator();
-  int use_count();
-  c10::DeviceIndex device();
-  static MempoolId_t graph_pool_handle(bool is_user_created = true);
-
- private:
-  static std::atomic<CaptureId_t> uid_;
-  static std::atomic<CaptureId_t> uuid_;
-  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_;
-  bool is_user_created_;
-  MempoolId_t id_;
-  c10::DeviceIndex device_;
-};
-
-} // namespace at::cuda
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const {
    // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
    if(v < 500) {
      // If major=xx, minor=yy then format -> xxyy
-      oss << (v / 100) << '.' << (v % 10);
+      oss << (v / 100) << "." << (v % 10);
    }
    else {
      // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
-      oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000);
+      oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
    }
 #else
-    oss << (v / 1000) << '.' << (v / 10 % 100);
+    oss << (v / 1000) << "." << (v / 10 % 100);
    if (v % 10 != 0) {
-      oss << '.' << (v % 10);
+      oss << "." << (v % 10);
    }
 #endif
  };
@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const {
  oss << "  - HIP Runtime ";
 #endif
  printCudaStyleVersion(runtimeVersion);
-  oss << '\n';
+  oss << "\n";

  // TODO: Make HIPIFY understand CUDART_VERSION macro
 #if !defined(USE_ROCM)
  if (runtimeVersion != CUDART_VERSION) {
    oss << "  - Built with CUDA Runtime ";
    printCudaStyleVersion(CUDART_VERSION);
-    oss << '\n';
+    oss << "\n";
  }
-  oss << "  - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n';
+  oss << "  - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
 #endif

 #if !defined(USE_ROCM)
@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const {


  auto printCudnnStyleVersion = [&](size_t v) {
-    oss << (v / 1000) << '.' << (v / 100 % 10);
+    oss << (v / 1000) << "." << (v / 100 % 10);
    if (v % 100 != 0) {
-      oss << '.' << (v % 100);
+      oss << "." << (v % 100);
    }
  };

@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const {
  if (cudnnCudartVersion != CUDART_VERSION) {
    oss << "  (built against CUDA ";
    printCudaStyleVersion(cudnnCudartVersion);
-    oss << ')';
+    oss << ")";
  }
-  oss << '\n';
+  oss << "\n";
  if (cudnnVersion != CUDNN_VERSION) {
    oss << "    - Built with CuDNN ";
    printCudnnStyleVersion(CUDNN_VERSION);
-    oss << '\n';
+    oss << "\n";
  }
 #endif
 #else
  // TODO: Check if miopen has the functions above and unify
-  oss << "  - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n';
+  oss << "  - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
 #endif

 #if AT_MAGMA_ENABLED()
-  oss << "  - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n';
+  oss << "  - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
 #endif

  return oss.str();
--- a/aten/src/ATen/cuda/jiterator.cu
+++ b/aten/src/ATen/cuda/jiterator.cu
@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(

  // The cache key includes all the parameters to generate_code + vec_size + dev_idx
  std::stringstream ss;
-  ss << nInputs << '_' << nOutputs << f;
+  ss << nInputs << "_" << nOutputs << f;
  ss << f_inputs_type_str << compute_type_str << result_type_str;
  ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
  ss << extra_args_types;
@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic(

  // The cache key includes all the parameters to generate_code + dev_idx
  std::stringstream ss;
-  ss << nInputs << '_' << nOutputs << f;
+  ss << nInputs << "_" << nOutputs << f;
  ss << f_inputs_type_str << compute_type_str << result_type_str;
  ss << contiguous << dynamic_casting;
  ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
--- a/aten/src/ATen/cuda/tunable/Tunable.cpp
+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp
@ -52,10 +52,10 @@ TuningContext* getTuningContext() {
 std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) {
  static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
  if (!blaslog) {
-    return stream << entry.key_ << ',' << entry.time_;
+    return stream << entry.key_ << "," << entry.time_;
  }
  else {
-    return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
+    return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
  }
 }

@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std
    if (isNew) {
      static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
      if (!blaslog) {
-        untuned_file << op_signature << ',' << params_signature << std::endl;
+        untuned_file << op_signature << "," << params_signature << std::endl;
      }
      else {
-        untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
+        untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
      }
      TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature);
    }
@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const

  if(!file_exists || file_empty) {
    for(const auto& [key, val] : validators) {
-      (*realtime_out_) << "Validator," << key << ',' << val << std::endl;
+      (*realtime_out_) << "Validator," << key << "," << val << std::endl;
      realtime_out_->flush();
    }
    validators_written_ = true;
@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std
    return;
  }

-  (*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl;
+  (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
  realtime_out_->flush(); //ensure immediate write to disk

  TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);
--- a/aten/src/ATen/cudnn/Descriptors.cpp
+++ b/aten/src/ATen/cudnn/Descriptors.cpp
@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) {
      return "CUDNN_DATA_UINT8x4";
    default:
      std::ostringstream oss;
-      oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
+      oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
-  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
+  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
  int nbDims = 0;
  int dimA[CUDNN_DIM_MAX];
  int strideA[CUDNN_DIM_MAX];
  cudnnDataType_t dtype{};
  cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA);
-  out << "    type = " << cudnnTypeToString(dtype) << '\n';
-  out << "    nbDims = " << nbDims << '\n';
+  out << "    type = " << cudnnTypeToString(dtype) << "\n";
+  out << "    nbDims = " << nbDims << "\n";
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  out << "    strideA = ";
  for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  return out;
 }

@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {
      return "CUDNN_TENSOR_NHWC";
    default:
      std::ostringstream oss;
-      oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ')';
+      oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ")";
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) {
-  out << "FilterDescriptor " << static_cast<void*>(d.desc()) << '\n';
+  out << "FilterDescriptor " << static_cast<void*>(d.desc()) << "\n";
  int nbDims = 0;
  int dimA[CUDNN_DIM_MAX];
  cudnnDataType_t dtype{};
  cudnnTensorFormat_t tformat{};
  cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA);
-  out << "    type = " << cudnnTypeToString(dtype) << '\n';
-  out << "    tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n';
-  out << "    nbDims = " << nbDims << '\n';
+  out << "    type = " << cudnnTypeToString(dtype) << "\n";
+  out << "    tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n";
+  out << "    nbDims = " << nbDims << "\n";
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  return out;
 }

--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int6
 }

 std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) {
-  os << layer.layerId() << ':' << layer.key();
+  os << layer.layerId() << ":" << layer.key();
  return os;
 }
 std::ostream& operator<< (std::ostream& os, const std::vector<DynamicLayer>& dls) {
  os << "DynamicLayerStack[ ";
  for (const auto& layer : dls) {
-    os << layer << ' ';
+    os << layer << " ";
  }
-  os << ']';
+  os << "]";
  return os;
 }

--- a/aten/src/ATen/functorch/TensorWrapper.cpp
+++ b/aten/src/ATen/functorch/TensorWrapper.cpp
@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
    if (batched) {
      ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", ";
      dumpTensor(ss, batched->value());
-      ss << ']';
+      ss << "]";
      return;
    }
    ss << "Tensor" << tensor.sizes();
@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
    ss << "dead, ";
  }
  dumpTensor(ss, wrapped->value());
-  ss << ']';
+  ss << "]";
 }

 void TensorWrapper::refreshMetadata() {
--- a/aten/src/ATen/miopen/Descriptors.cpp
+++ b/aten/src/ATen/miopen/Descriptors.cpp
@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
      return "miopenBFloat16";
    default:
      std::ostringstream oss;
-      oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
+      oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
-  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
+  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
  int nbDims = 0;
  int dimA[MIOPEN_DIM_MAX];
  int strideA[MIOPEN_DIM_MAX];
  miopenDataType_t dtype;
  miopenGetTensorDescriptorSize(d.desc(), &nbDims);
  miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
-  out << "    type = " << miopenTypeToString(dtype) << '\n';
-  out << "    nbDims = " << nbDims << '\n';
+  out << "    type = " << miopenTypeToString(dtype) << "\n";
+  out << "    nbDims = " << nbDims << "\n";
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  out << "    strideA = ";
  for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  return out;
 }

--- a/aten/src/ATen/mps/MPSProfiler.h
+++ b/aten/src/ATen/mps/MPSProfiler.h
@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo {
    std::stringstream kernelStr;
    kernelStr << kernelName;
    for (const Tensor& tensor : tensors) {
-      kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId);
+      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
    }
    return kernelStr.str();
  }
--- a/aten/src/ATen/mps/MPSProfiler.mm
+++ b/aten/src/ATen/mps/MPSProfiler.mm
@ -39,9 +39,9 @@ std::string BaseInfo::buildTensorString(const Tensor& tensor, bool includeBuffer
    // see comments for INCLUDE_BUFFER_ID
    if (includeBufferId && deviceType == at::kMPS) {
      id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
-      tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')';
+      tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")";
    }
-    tensorStr << ':' << tensor.scalar_type() << tensor.sizes();
+    tensorStr << ":" << tensor.scalar_type() << tensor.sizes();
    return tensorStr.str();
  } else {
    return "undefined";
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co
    std::stringstream ss;
    ss << arg_name << " should be greater than zero but got (";
    std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
-    ss << args.back() <<  ")" << " (while checking arguments for " << c << ')';
+    ss << args.back() <<  ")" << " (while checking arguments for " << c << ")";
    TORCH_CHECK(false, ss.str());
  }
 }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params)
      << "  deterministic = " << params.deterministic
      << "  cudnn_enabled = " << params.cudnn_enabled
      << "  allow_tf32 = " << params.allow_tf32
-      << '}';
+      << "}";
  return out;
 }

--- a/aten/src/ATen/native/PackedSequence.cpp
+++ b/aten/src/ATen/native/PackedSequence.cpp
@ -142,7 +142,6 @@ Tensor _pack_padded_sequence_backward_symint(const Tensor& grad, c10::SymIntArra
 std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor& _batch_sizes, bool batch_first, const Scalar& padding_value, int64_t total_length) {
  auto batch_sizes_t = _batch_sizes.contiguous();
  checkLongTensor(batch_sizes_t);
-  TORCH_CHECK(batch_sizes_t.numel() > 0, "batch_sizes can not be empty");

  int64_t * batch_sizes = batch_sizes_t.data_ptr<int64_t>();
  int64_t max_batch_size = batch_sizes[0];
--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
       << ", hop_length=" << hop_length << ", win_length=" << win_length \
       << ", window="; \
    if (window.defined()) { \
-      SS << window.toString() << '{' << window.sizes() << '}'; \
+      SS << window.toString() << "{" << window.sizes() << "}"; \
    } else { \
      SS << "None"; \
    } \
@ -1046,7 +1046,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
       << ", hop_length=" << hop_length << ", win_length=" << win_length \
       << ", window="; \
    if (window.defined()) { \
-      SS << window.toString() << '{' << window.sizes() << '}'; \
+      SS << window.toString() << "{" << window.sizes() << "}"; \
    } else { \
      SS << "None"; \
    } \
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@ -1087,8 +1087,7 @@ TORCH_IMPL_FUNC(index_copy_out)
    result.copy_(self);

  // See Note [Enabling Deterministic Operations]
-  if ((result.is_cuda() || result.is_xpu()) &&
-      globalContext().deterministicAlgorithms()) {
+  if (result.is_cuda() && globalContext().deterministicAlgorithms()) {
    torch::List<std::optional<Tensor>> indices;
    indices.resize(dim + 1);
    indices.set(dim, index);
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@ -523,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu(
 }

 void _print(std::string_view s) {
-  std::cout << s << '\n';
+  std::cout << s << "\n";
 }

 // Sorting-based algorithm for isin(); used when the number of test elements is
--- a/aten/src/ATen/native/TransposeType.h
+++ b/aten/src/ATen/native/TransposeType.h
@ -1,8 +1,6 @@
 #pragma once
 #include <c10/util/Exception.h>

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 namespace at::native {

 // Used as an interface between the different BLAS-like libraries
@ -23,5 +21,3 @@ static inline char to_blas(TransposeType trans) {
 }

 }  // namespace at::native
-
-C10_DIAGNOSTIC_POP()
--- a/aten/src/ATen/native/UnaryOps.cpp
+++ b/aten/src/ATen/native/UnaryOps.cpp
@ -904,11 +904,19 @@ Tensor mvlgamma(const Tensor& self, int64_t p) {
  return args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi<double>) * QUARTER);
 }

-// since mvlgamma_ has different signature from its
-// out and functional variant, we explicitly
-// define it (instead of using structured kernel).
 Tensor& mvlgamma_(Tensor& self, int64_t p) {
-  return at::mvlgamma_out(self, self, p);
+  mvlgamma_check(self, p);
+  Tensor args = native::arange(
+      -p *HALF  + HALF,
+      HALF,
+      HALF,
+      optTypeMetaToScalarType(self.options().dtype_opt()),
+      self.options().layout_opt(),
+      self.options().device_opt(),
+      self.options().pinned_memory_opt());
+  args = args.add(self.unsqueeze(-1));
+  const auto p2_sub_p = static_cast<double>(p * (p - 1));
+  return self.copy_(args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi<double>) * QUARTER));
 }

 Tensor& mvlgamma_out(const Tensor& self, int64_t p, Tensor& result) {
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
 bool launchGemmAndBiasCublasLt(
    // args contains result which is modified
    cublasCommonArgs& args,
-    const std::optional<Tensor>& self,
+    const Tensor& self,
    const Scalar& alpha,
    Activation activation = Activation::None
 ) {
@ -304,8 +304,12 @@ bool launchGemmAndBiasCublasLt(
  // or when it can be squeezed to 1D.
  // self_ptr == nullptr implies ignore bias epilogue
  // and use standard gemm-like API.
-  const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
-
+  const auto* self_ptr = [&]() -> auto {
+    if (self.dim() == 1 || self.squeeze().dim() == 1) {
+      return self.const_data_ptr<scalar_t>();
+    }
+    return static_cast<const scalar_t*>(nullptr);
+  }();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -388,30 +392,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
  #ifdef USE_ROCM
  // Conditioned on the device index, which is not persistent
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
+  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
  #endif
  // Condition on the input
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
+  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
+  // }

  at::ScalarType scalar_type = mat1.scalar_type();
  bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;

-  #ifdef USE_ROCM
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
-  #endif
-
-  bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
-  // for float output with half input cublasLT with bias produces wrong results
-  use_bias_ptr_lt &= !is_float_output_with_half_input;
-
  // Handle result/self shapes
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

-      // We do not copy bias only when we need the bias ptr
+    // We use bias ptr in the Lt path only when bias is 1D
+    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
+    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
+      if (!use_bias_ptr_lt) {
+        // We do expand self even before
+        // check for beta != 0.0 to make sure that
+        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
+        // runs green.
+        return expand_size(self, result.sizes(), "addmm");
+      }
+      return c10::MaybeOwned<Tensor>::borrowed(self);
+    }();
+    // We do not copy bias only when we need the bias ptr
    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
      // NOTE: self should broadcast over result
-      at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
+      at::native::copy_(result, *self_maybe_expanded);
    }
  }

@ -459,7 +468,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
        }
      );
      #endif
@ -471,7 +480,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
        }
      );
    } // end is_float_output_with_half_input
@ -927,7 +936,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
  return _int_mm_out_cuda(self, mat2, result);
 }

-static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
+static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
  // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
  TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
  TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@ -951,7 +960,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

-  if (self_baddbmm.has_value()) {
+  if (!is_bmm && self_baddbmm.has_value()) {
    const auto& self = self_baddbmm.value();
    TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
    TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
@ -959,12 +968,15 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
 }

 Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
+  IntArrayRef batch1_sizes = batch1.sizes();
+  IntArrayRef batch2_sizes = batch2.sizes();
+
+  Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
  return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
 }

 Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
  Scalar beta(0.0);
  Scalar alpha(1.0);
  {
@ -976,16 +988,14 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
 }

 Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
-  "self dtype must match either out_dtype or batch1 dtype");
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
-  return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
+  // We need to copy the tensor
+  Tensor out = self.clone().to(self.options().dtype(out_dtype));
+
+  return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
 }

 Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
-  // We need to copy the tensor
-  out.copy_(self);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
  {
    NoNamesGuard guard;
    baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1020,27 +1030,24 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
 }

 Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
-  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
-  Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
+  Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
  return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
 }

 Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-// repeat dimensionality checks for direct calls to `out` overload
+  TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
+  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
  TORCH_CHECK(
      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
-  TORCH_CHECK(out_dtype == mat1.scalar_type() ||
-  (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
-  "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
-  TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
-    "self dtype must match either out_dtype or mat1 dtype");
+  TORCH_CHECK(out_dtype == self.scalar_type() ||
+    (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
+    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
+  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");

  addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);

--- a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h
+++ b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h
@ -1,7 +1,6 @@
 #pragma once

 #include <ATen/native/CompositeRandomAccessorCommon.h>
-#include <thrust/swap.h>
 #include <thrust/tuple.h>

 namespace at { namespace native {
--- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu
+++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu
@ -75,52 +75,30 @@ static inline bool can_use_int32_nhwc(
  return true;
 }

-static inline bool can_use_int32_nchw(
-    int64_t nbatch, int64_t channels,
-    int64_t height, int64_t width,
-    int64_t pooled_height, int64_t pooled_width) {
-  int64_t hw = height * width;
-  return can_use_int32_nhwc(
-      nbatch, channels, height, width,
-      pooled_height, pooled_width,
-      channels * hw,  // in_stride_n
-      hw, // in_stride_c
-      width, // in_stride_h
-      1 // in_stride_w
-  );
-}
-
 // kernels borrowed from Caffe
-template <typename scalar_t, typename index_t>
-__global__ void max_pool_forward_nchw(
-    const index_t nthreads,
-    const scalar_t* bottom_data,
-    const int64_t channels,
-    const int64_t height,
-    const int64_t width,
-    const int pooled_height,
-    const int pooled_width,
-    const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w,
-    const int pad_h, const int pad_w,
-    const int dilation_h, const int dilation_w,
-    scalar_t* top_data,
+template <typename scalar_t>
+__global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom_data,
+    const int64_t channels, const int64_t height,
+    const int64_t width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, const int pad_h, const int pad_w,
+    const int dilation_h, const int dilation_w, scalar_t* top_data,
    int64_t* top_mask) {
-  CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) {
-    index_t pw = index % pooled_width;
-    index_t ph = (index / pooled_width) % pooled_height;
-    index_t c = (index / pooled_width / pooled_height) % channels;
-    index_t n = index / pooled_width / pooled_height / channels;
-    index_t hstart = ph * stride_h - pad_h;
-    index_t wstart = pw * stride_w - pad_w;
-    index_t hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height);
-    index_t wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width);
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height);
+    int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width);
    while(hstart < 0)
      hstart += dilation_h;
    while(wstart < 0)
      wstart += dilation_w;
    scalar_t maxval = at::numeric_limits<scalar_t>::lower_bound(); // -Infinity
-    index_t maxidx = hstart * width + wstart;
+    int maxidx = hstart * width + wstart;
    const scalar_t* btm_data = bottom_data + (n * channels + c) * height * width;
    for (int h = hstart; h < hend; h += dilation_h) {
      for (int w = wstart; w < wend; w += dilation_w) {
@ -273,39 +251,32 @@ __global__ void max_pool_forward_nhwc(

 static constexpr int BLOCK_THREADS = 256;

-template <typename scalar_t, typename accscalar_t, typename index_t>
+template <typename scalar_t, typename accscalar_t>
 #if defined (USE_ROCM)
 C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 4)
 #else
 C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 8)
 #endif
-__global__ void max_pool_backward_nchw(
-    const scalar_t* top_diff,
-    const int64_t* top_mask,
-    const index_t num,
-    const index_t channels,
-    const index_t height,
-    const index_t width,
-    const index_t pooled_height,
-    const index_t pooled_width,
-    const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w,
-    const int pad_h, const int pad_w,
+__global__ void max_pool_backward_nchw(const scalar_t* top_diff,
+    const int64_t* top_mask, const int num, const int64_t channels,
+    const int64_t height, const int64_t width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w,
    scalar_t* bottom_diff) {
-  CUDA_KERNEL_LOOP_TYPE(index, height*width, index_t) {
-    index_t h = index / width;
-    index_t w = index - h * width;
-    index_t phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h);
-    index_t phend = p_end(h, pad_h, pooled_height, stride_h);
-    index_t pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w);
-    index_t pwend = p_end(w, pad_w, pooled_width, stride_w);
-    for (index_t n = blockIdx.y; n < num; n += gridDim.y) {
-      for (index_t c = blockIdx.z; c < channels; c += gridDim.z) {
+  CUDA_KERNEL_LOOP(index, height*width) {
+    int h = index / width;
+    int w = index - h * width;
+    int phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h);
+    int phend = p_end(h, pad_h, pooled_height, stride_h);
+    int pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w);
+    int pwend = p_end(w, pad_w, pooled_width, stride_w);
+    for (int n = blockIdx.y; n < num; n += gridDim.y) {
+      for (int c = blockIdx.z; c < channels; c+= gridDim.z) {
        accscalar_t gradient = accscalar_t(0);
-        index_t offset = (n * channels + c) * pooled_height * pooled_width;
-        for (index_t ph = phstart; ph < phend; ++ph) {
-          for (index_t pw = pwstart; pw < pwend; ++pw) {
+        int offset = (n * channels + c) * pooled_height * pooled_width;
+        for (int ph = phstart; ph < phend; ++ph) {
+          for (int pw = pwstart; pw < pwend; ++pw) {
            if (top_mask[ph * pooled_width + pw + offset] == h * width + w) {
              gradient += static_cast<accscalar_t>(top_diff[ph * pooled_width + pw + offset]);
            }
@ -498,6 +469,8 @@ const Tensor& indices) {
  const int64_t in_stride_h = input.stride(-2);
  const int64_t in_stride_w = input.stride(-1);

+  const int count = safe_downcast<int, int64_t>(output.numel());
+
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
    "max_pool2d_with_indices_out_cuda_frame",
    [&] {
@ -580,42 +553,14 @@ const Tensor& indices) {
          break;
        }
        case MemoryFormat::Contiguous: {
-          const int threads = std::min(
-              at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
-              BLOCK_THREADS);
-          const int64_t nthreads = output.numel();
-          bool use_int32 = can_use_int32_nchw(
-              nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth);
-          const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
-          const int blocks = static_cast<int>(std::min<int64_t>(
-              ceil_div(nthreads, static_cast<int64_t>(threads)),
-              static_cast<int64_t>(maxGridX)));
-          auto stream = at::cuda::getCurrentCUDAStream();
-          if (use_int32) {
-            max_pool_forward_nchw<scalar_t, int32_t>
-                <<<blocks, threads, 0, stream>>>(
-                    static_cast<int32_t>(nthreads),
-                    input_data,
-                    static_cast<int32_t>(nInputPlane),
-                    static_cast<int32_t>(inputHeight),
-                    static_cast<int32_t>(inputWidth),
-                    static_cast<int32_t>(outputHeight),
-                    static_cast<int32_t>(outputWidth),
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    output_data, indices_data);
-          } else {
-            max_pool_forward_nchw<scalar_t, int64_t>
-                <<<blocks, threads, 0, stream>>>(
-                    nthreads,
-                    input_data,
-                    nInputPlane,
-                    inputHeight,
-                    inputWidth,
-                    outputHeight,
-                    outputWidth,
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    output_data, indices_data);
-          }
+          const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
+                                            BLOCK_THREADS);
+          max_pool_forward_nchw<scalar_t>
+              <<<ceil_div(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+              count, input_data,
+                  nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+                  kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+                  output_data, indices_data);
          C10_CUDA_KERNEL_LAUNCH_CHECK();
          break;
        }
@ -688,6 +633,8 @@ const Tensor& gradInput) {

  gradInput.zero_();

+  int64_t count = input.numel();
+
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
    "max_pool2d_with_indices_out_cuda_frame",
    [&] {
@ -745,45 +692,25 @@ const Tensor& gradInput) {
          break;
        }
        case MemoryFormat::Contiguous: {
-          const int threads = std::min(
-              at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
-              BLOCK_THREADS);
-          const int imgcount = inputWidth * inputHeight;
-          const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
-          const int maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
-          const int maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2];
-          const int blocks_x = std::min(ceil_div(imgcount, threads), maxGridX);
-          dim3 grid(blocks_x, static_cast<unsigned>(std::min<int64_t>(nbatch, maxGridY)), static_cast<unsigned>(std::min<int64_t>(nInputPlane, maxGridZ)));
-          bool use_int32 = can_use_int32_nchw(
-              nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth);
-          auto stream = at::cuda::getCurrentCUDAStream();
-          if (use_int32) {
-            max_pool_backward_nchw<scalar_t, accscalar_t, int32_t>
-                <<<grid, threads, 0, stream>>>(
-                    gradOutput_data,
-                    indices_data,
-                    static_cast<int32_t>(nbatch),
-                    static_cast<int32_t>(nInputPlane),
-                    static_cast<int32_t>(inputHeight),
-                    static_cast<int32_t>(inputWidth),
-                    static_cast<int32_t>(outputHeight),
-                    static_cast<int32_t>(outputWidth),
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    gradInput_data);
-          } else {
-            max_pool_backward_nchw<scalar_t, accscalar_t, int64_t>
-                <<<grid, threads, 0, stream>>>(
-                    gradOutput_data,
-                    indices_data,
-                    nbatch,
-                    nInputPlane,
-                    inputHeight,
-                    inputWidth,
-                    outputHeight,
-                    outputWidth,
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    gradInput_data);
-          }
+          int imgcount = inputWidth * inputHeight;
+          dim3 grid;
+          const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS;
+          grid.x = blocks;
+          grid.y = nbatch;
+          uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
+          if (maxGridY < grid.y) grid.y = maxGridY;
+          grid.z = nInputPlane;
+          uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2];
+          if (maxGridZ < grid.z) grid.z = maxGridZ;
+
+          max_pool_backward_nchw<scalar_t, accscalar_t>
+          <<<grid, BLOCK_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
+                  gradOutput_data,
+                  indices_data,
+                  nbatch,
+                  nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+                  kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+                  gradInput_data);
          C10_CUDA_KERNEL_LAUNCH_CHECK();
          break;
        }
--- a/aten/src/ATen/native/cuda/EmbeddingBag.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu
@ -78,18 +78,9 @@ __global__ void EmbeddingBag_updateOutputKernel_max(
      scalar_t weightFeatMax = 0;
      int64_t bag_size_ = 0;
      int64_t maxWord = -1;
-
-      // Separate validation loop reduces register pressure in the main loop below.
-      // No early exit (break) on invalid input as benchmarking shows it degrades performance.
-      bool has_invalid_index = false;
-      for (int64_t emb = begin; emb < end; emb++) {
-        index_t input_idx = input[emb];
-        has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
-      }
-      CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
-
      for (int64_t emb = begin; emb < end; emb++) {
        bool pad = (input[emb] == padding_idx);
+        CUDA_KERNEL_ASSERT(input[emb] < numRows);
        const int64_t weightRow = input[emb] * weight_stride0;
        scalar_t weightValue = weightFeat[weightRow];
        if (bag_size_ == 0 || weightValue > weightFeatMax) {
@ -138,19 +129,10 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean(
      CUDA_KERNEL_ASSERT(end >= begin);
      accscalar_t weightFeatSum = 0;
      int64_t bag_size_ = 0;
-
-      // Separate validation loop reduces register pressure in the main loop below.
-      // No early exit (break) on invalid input as benchmarking shows it degrades performance.
-      bool has_invalid_index = false;
-      for (int64_t emb = begin; emb < end; emb++) {
-        index_t input_idx = input[emb];
-        has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
-      }
-      CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
-
      for (int64_t emb = begin; emb < end; emb++) {
        index_t input_idx = input[emb];
        bool pad = (input_idx == padding_idx);
+        CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows);
        const int64_t weightRow = input_idx * weight_stride0;
        scalar_t weightValue = weightFeat[weightRow];
        weightValue = pad ? static_cast<scalar_t>(0) : weightValue;
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -78,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm(
        const Tensor& mat_a,
        const Tensor& mat_b,
        const Tensor& scale_a,
-        const SwizzleType swizzle_a,
+        const SwizzleType& swizzle_a,
        const Tensor& scale_b,
-        const SwizzleType swizzle_b,
+        const SwizzleType& swizzle_b,
        const std::optional<at::Tensor>& offs,
        Tensor& out) {
    const bool a_is_2d = mat_a.dim() == 2;
@ -607,8 +607,6 @@ _scaled_grouped_mm_cuda_v2(
      // scale shape checks
      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
-      // swizze checks
-      TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument");
      return _mx8_mx8_bf16_grouped_mm_fbgemm(
          mat_a,
          mat_b,
@ -671,12 +669,9 @@ std::optional<c10::ScalarType> out_dtype) {
  // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
  // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
  bool use_fast_path = false;
-  // On non CK system(w/ ROCm), make sure use_fast_path is false
-#if defined(USE_ROCM_CK_GEMM)
  if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) {
    use_fast_path = true;
  }
-#endif //USE_ROCM_CK_GEMM
 #endif
  const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
  Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
@ -685,11 +680,7 @@ std::optional<c10::ScalarType> out_dtype) {
 #ifndef USE_ROCM
    at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
 #else
-#if defined(USE_ROCM_CK_GEMM)
    at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
-#else
-    TORCH_WARN("ROCm: Group Gemm through CK not selected.");
-#endif //USE_ROCM_CK_GEMM
 #endif
  } else {
    _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
--- a/aten/src/ATen/native/cuda/LogAddExpKernel.cu
+++ b/aten/src/ATen/native/cuda/LogAddExpKernel.cu
@ -2,250 +2,18 @@
 #include <ATen/Dispatch.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/JitLoops.cuh>
-#include <ATen/native/cuda/jit_utils.h>
-#include <ATen/native/cuda/ScanUtils.cuh>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/OpMathType.h>
 #include <c10/util/MathConstants.h>
-#include <c10/util/complex.h>
-
-#include <cmath>
-#include <limits>

 // NOTE: CUDA on Windows requires that the enclosing function
 // of a __device__ lambda not have internal linkage.

 namespace at::native {

-// custom min and max to be used in logaddexp for  complex arguments
-template <typename scalar_t, bool min>
-__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  scalar_t xr = std::real(x);
-  scalar_t yr = std::real(y);
-  if (::isnan(yr) || (::isnan(std::imag(y)))) {
-    return y;
-  } else if (::isnan(xr) || (::isnan(std::imag(x)))) {
-    return x;
-  } else if (min) { // min
-    return (xr < yr) ? x : y;
-  } else { // max
-    return (xr >= yr) ? x : y;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
-  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
-  // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
-  const auto isnan_x = at::_isnan(x);
-  const auto isnan_y = at::_isnan(y);
-  scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
-  scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
-  if (min != max || ::isfinite(min)) {
-    // nan will be propagated here
-    return ::log1p(std::exp(min - max)) + max;
-  } else {
-    // special case to correctly handle infinite cases
-    return x;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the x is finite (not inf nor nan)
-  const auto xreal = std::real(x);
-  const auto ximag = std::imag(x);
-  const auto exp_x_abs = std::exp(xreal);
-  auto exp_x_real = exp_x_abs * std::cos(ximag);
-  auto exp_x_imag = exp_x_abs * std::sin(ximag);
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the real part of x is infinite
-  const auto ximag = std::imag(x);
-  constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
-  if (!::isfinite(ximag)) {  // add this to make consitent with std::exp(x+yi)
-    return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  const auto sin = std::sin(ximag);
-  const auto cos = std::cos(ximag);
-  // special case if the angle is exactly the multiple of pi/2
-  auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
-  auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
-  c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
-  scalar_t min_real = std::real(min);
-  scalar_t max_real = std::real(max);
-
-  if (::isnan(min_real) || ::isnan(std::imag(min))) {
-    // handling the "infectious" NaNs
-    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  else if ((!::isfinite(min_real)) && (min_real == max_real)) {
-    if (min_real < 0) {
-      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
-      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
-      // It does not matter if we're taking the exp of this value
-      return min;
-    } else {
-      // handle the +inf case, we don't need the special precision for log1p for small values
-      // and to avoid producing nan in case of real(max) == real(min) == +inf
-      const auto exp_min = _fast_build_exp_inf(min);
-      const auto exp_max = _fast_build_exp_inf(max);
-      return ::log1p(exp_min + exp_max - 1);  // log1p(x - 1) builds faster than log
-    }
-  } else {
-    const auto minmax = min - max;
-    c10::complex<scalar_t> exp_minmax;
-    if (!::isfinite(minmax.real())) {
-        exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
-    } else {
-        exp_minmax = _fast_build_exp(minmax);
-    }
-    return ::log1p(exp_minmax) + max;
-  }
-}
-
-// Complex logaddexp jiterator string
-const auto logaddexp_complex_string = jiterator_stringify(
-    template<typename T>
-    std::complex<T> log1p(const std::complex<T>& z)
-    {
-      using complex_t = std::complex<T>;
-      T x = z.real();
-      T y = z.imag();
-      T zabs = abs(z);
-      T theta = atan2(y, x + T(1));
-      if (zabs < 0.5) {
-          T r = x * (T(2) + x) + y * y;
-          if (r == 0) { // handle underflow
-              return complex_t(x, theta);
-          }
-          return complex_t(T(0.5) * std::log1p(r), theta);
-      } else {
-          T z0 = std::hypot(x + 1, y);
-          return complex_t(log(z0), theta);
-      }
-    }
-
-    // separated _logaddexp_minmax into 2 different functions for jiterator_string
-    template <typename T>
-    std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr < yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr >= yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp(const std::complex<T>& x) {
-        const auto xreal = x.real();
-        const auto ximag = x.imag();
-        const auto exp_x_abs = exp(xreal);
-        auto exp_x_real = exp_x_abs * cos(ximag);
-        auto exp_x_imag = exp_x_abs * sin(ximag);
-        return std::complex<T>(exp_x_real, exp_x_imag);
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
-        using complex_t = std::complex<T>;
-        const auto ximag = x.imag();
-        const T exp_x_abs = INFINITY;
-        if (!isfinite(ximag)) {
-            return complex_t(exp_x_abs, NAN);
-        }
-        const auto sin_val = sin(ximag);
-        const auto cos_val = cos(ximag);
-        auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
-        auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
-        return complex_t(exp_x_real, exp_x_imag);
-    }
-
-    template <typename complex_t>
-    complex_t logaddexp_complex(complex_t x, complex_t y) {
-        using T = typename complex_t::value_type;
-        complex_t min_val = logaddexp_min(x, y);
-        complex_t max_val = logaddexp_max(x, y);
-        T min_real = min_val.real();
-        T max_real = max_val.real();
-
-        if (isnan(min_real) || isnan(min_val.imag())) {
-            return complex_t(NAN, NAN);
-        }
-        else if ((!isfinite(min_real)) && (min_real == max_real)) {
-            if (min_real < T(0)) {
-                return min_val;
-            } else {
-                const auto exp_min = fast_build_exp_inf<T>(min_val);
-                const auto exp_max = fast_build_exp_inf<T>(max_val);
-                return log1p(exp_min + exp_max - complex_t(1, 0));
-            }
-        } else {
-            const auto minmax = min_val - max_val;
-            complex_t exp_minmax;
-            if (!isfinite(minmax.real())) {
-                exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
-            } else {
-                exp_minmax = fast_build_exp<T>(minmax);
-            }
-            return log1p(exp_minmax) + max_val;
-        }
-    }
-);
-
-constexpr char logaddexp_complex_name[] = "logaddexp_complex";
 void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
-  if (at::isComplexType(iter.dtype())) {
-#if AT_USE_JITERATOR()
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      jitted_gpu_kernel<
-          /*name=*/logaddexp_complex_name,
-          /*return_dtype=*/scalar_t,
-          /*common_dtype=*/scalar_t,
-          /*arity=*/2>(iter, logaddexp_complex_string);
-    });
-#else
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      using opmath_t = at::opmath_type<scalar_t>;
-      gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
-        const auto a = static_cast<opmath_t>(a_);
-        const auto b = static_cast<opmath_t>(b_);
-        return static_cast<scalar_t>(_log_add_exp_helper(a, b));
-      });
-    });
-#endif
-  } else {
-    AT_DISPATCH_FLOATING_TYPES_AND2(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
      ScalarType::BFloat16, ScalarType::Half,
      iter.dtype(), "logaddexp_cuda",
      [&]() {
@ -261,7 +29,6 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
          }
        });
      });
-  }
 }

 void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {
--- a/aten/src/ATen/native/cuda/Reduce.cu
+++ b/aten/src/ATen/native/cuda/Reduce.cu
@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) {
  if (dim.y == 1 && dim.z == 1) {
    out << dim.x;
  } else {
-    out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']';
+    out << "[" << dim.x << "," << dim.y << "," << dim.z << "]";
  }
  return out;
 }
@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "input_mult=[";
  for (int i = 0; i < 3; i++) {
    if (i != 0) {
-      out << ',';
+      out << ",";
    }
    out << config.input_mult[i];
  }
@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "output_mult=[";
  for (int i = 0; i < 2; i++) {
    if (i != 0) {
-      out << ',';
+      out << ",";
    }
    out << config.output_mult[i];
  }
@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "block=" << config.block() << ", ";
  out << "grid=" << config.grid() << ", ";
  out << "global_memory_size=" << config.global_memory_size();
-  out << ')';
+  out << ")";
  return out;
 }

--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@ -740,12 +740,7 @@ _scaled_rowwise_rowwise(
  TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
  TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())

-  // if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case
-  TORCH_CHECK_VALUE(
-      scale_a.stride(1) == 1 ||
-      scale_a.size(1) == 1,
-      "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)
-  );
+  TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
  TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));

  auto scaling_choice_a = ScalingType::RowWise;
@ -1101,19 +1096,6 @@ _scaled_mxfp8_mxfp8(
  return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
 }

-void
-_check_mxfp4_support() {
-#ifndef USE_ROCM
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  // Only on B200 GPUs
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    // B200 = 10.0, B300 = 10.3
-    dprops->major == 10,
-    "MXFP4 scaling only supported in CUDA for B200/B300"
-  );
-#endif
-}
-

 Tensor&
 _scaled_mxfp4_mxfp4(
@ -1126,7 +1108,6 @@ _scaled_mxfp4_mxfp4(
 #if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
 #else
-  _check_mxfp4_support();
  // Restrictions:
  // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
--- a/aten/src/ATen/native/cuda/ScaledGroupMM.cu
+++ b/aten/src/ATen/native/cuda/ScaledGroupMM.cu
@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90(
  //       reinterpret_cast<ProblemShape::UnderlyingProblemShape*>(
  //           stride_output_h + group_count);

-  //   std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << "
+  //   std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << "
  //   "
-  //             << out.data_ptr() << ' ' << scale_a.data_ptr() << ' '
+  //             << out.data_ptr() << " " << scale_a.data_ptr() << " "
  //             << scale_b.data_ptr() << "\n";
  //   for (int i = 0; i < group_count; i++) {
  //     std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n";
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jack Taylor	dd175e6cc5	Fixes	2025-11-17 11:43:15 +00:00
Jack Taylor	dc72c91a69	Update build_triton_whl.py for new devices	2025-11-14 10:55:45 +00:00
Jack Taylor	7bd67717e4	Update build-triton-wheel.yml	2025-11-12 12:33:53 +00:00
Jack Taylor	d289a1d487	Dummy triton change	2025-11-11 16:36:28 +00:00
Jack Taylor	a9c52484ab	Update build-triton-wheel.yml	2025-11-11 13:31:25 +00:00
Jack Taylor	097a242504	Testing rocm_version entry so multiple jobs are kicked off	2025-11-11 12:57:33 +00:00
Jack Taylor	f0d9e2386b	Build triton whl for N, N-1 ROCm version	2025-11-11 06:26:55 -06:00