This is a bit weird: author_login is not a unique field, but author_url is.
Explicitly allow https://github.com/apps/pytorch-auto-revert to issue revert commands
Update mocks by running
```
sed -i -e s/8e262b0495bd934d39dda198d4c09144311c5ddd6cca6a227194bd48dbfe7201/47860a8f57a214a426d1150c29893cbc2aa49507f12b731483b1a1254bca3428/ gql_mocks.json
```
Test plan: Run
```python
from trymerge import GitHubPR
pr=GitHubPR("pytorch", "pytorch", 164660)
print(pr.get_last_comment().author_url, pr.get_comment_by_id(3375785595).author_url)
```
This should produce:
```
https://github.com/pytorch-auto-revert https://github.com/apps/pytorch-auto-revert
```
Plus added a regression test that checks two particular comments for revert validity
`pytorch-auto-revert` user is my alter ego :)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164911
Approved by: https://github.com/jeanschmidt
If there is a single autotuner choice, the wrong type of input node is used to instantiate `TritonTemplateBuffer` through `TritonTemplateCaller.output_node`. This PR distinguishes the input nodes used in `AlgorithmSelectorCache.__call__` between the actual inputs passed to the kernel at runtime, vs the possibly viewed inputs that influence scheduling behaviour (e.g. `MemoryDeps`) and codegen. See the added unit test for more detail.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163752
Approved by: https://github.com/eellison
Modified `multimem_one_shot_all_reduce_out` function to accept a `root` argument, making it a `multimem_reduce` op.
The original `multimem_one_shot_all_reduce` op becomes a caller of the `multimem_reduce`, with each rank providing its own rank id as root.
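A toy sketch of that relationship, simulating a rooted reduce over per-rank buffers (the names and signatures below are illustrative, not the actual symmetric-memory ops):
```python
import torch

def multimem_reduce_sim(rank_inputs, out_bufs, root):
    # Rooted reduce: only the root's output buffer receives the reduced sum.
    out_bufs[root].copy_(torch.stack(rank_inputs).sum(dim=0))

def one_shot_all_reduce_sim(rank_inputs, out_bufs):
    # The one-shot all-reduce is the rooted reduce invoked once per rank,
    # each rank passing its own rank id as root, so every buffer gets the sum.
    for rank in range(len(rank_inputs)):
        multimem_reduce_sim(rank_inputs, out_bufs, root=rank)

inputs = [torch.full((4,), float(r)) for r in range(4)]
outputs = [torch.empty(4) for _ in range(4)]
one_shot_all_reduce_sim(inputs, outputs)
assert all(torch.equal(o, torch.full((4,), 6.0)) for o in outputs)  # 0+1+2+3 = 6
```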
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164517
Approved by: https://github.com/ngimel
Summary:
1. Certain checkpoint load use cases are not aware of the properties of the data/tensors they want to load.
2. These use cases include data loader checkpoints and reading data for post-processing (when the original model definition is not available).
3. There, we have to use the saved checkpoint (metadata) as our source of truth.
4. This RFC proposal exposes the checkpoint metadata via a public API.
In this proposal we expose the stored state-dict metadata (minus the associated storage/chunk metadata).
Chunk/storage details should not be exposed to users; they are an implementation detail of the storage writer/reader.
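A rough sketch of the kind of inspection this enables; the exact public API added here is not reproduced, so the example below assumes the existing `FileSystemReader.read_metadata()` as a stand-in for the exposed metadata (minus chunk/storage details):
```python
from torch.distributed.checkpoint import FileSystemReader

# Assumption: "/tmp/ckpt" is a directory containing a DCP checkpoint.
metadata = FileSystemReader("/tmp/ckpt").read_metadata()

# state_dict_metadata maps fully-qualified names to per-entry metadata
# (tensor shape/dtype or raw bytes), which is enough to reason about the
# checkpoint without the original model definition.
for fqn, entry in metadata.state_dict_metadata.items():
    size = getattr(entry, "size", None)  # present for tensor entries
    print(fqn, type(entry).__name__, size)
```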
Test Plan:
UT.
Rollback Plan:
Differential Revision: D80231457
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160610
Approved by: https://github.com/saumishr
Summary: We have an internal request to help understand why the hash of `post_grad_custom_post_pass` is changing between attempts. We don't get useful info from the debug output, because we just print "<bytes>". Instead, attempt to print at least _some_ of the value in case it contains readable characters.
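A minimal sketch of the best-effort preview idea (truncation length and formatting are illustrative, not the exact codecache change):
```python
def preview_bytes(value: bytes, max_len: int = 100) -> str:
    # Best-effort decode so readable prefixes (e.g. "HelloWorld!") show up in
    # the debug log instead of just "<bytes>"; undecodable bytes are replaced.
    text = value.decode("utf-8", errors="replace")
    return text[:max_len] + ("..." if len(text) > max_len else "")

print(preview_bytes(b"HelloWorld!" + b"\xff" * 200))
```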
Test Plan:
Registered a dummy post_grad_custom_pass and printed codecache debug output
`TORCH_LOGS=+torch._inductor.codecache python ~/foo.py`
Yields something like:
```
V1007 16:41:19.024000 3546009 /data/users/slarsen/pytorch-3.10_4/torch/_inductor/codecache.py:989] [0/0] [law2ujt2wzjb5tyiu6jh64r2lxpvl62yvxcsmdouhg3qyelhhdv] post_grad_custom_post_pass: HelloWorld!����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������...
```
Differential Revision: [D84108770](https://our.internmc.facebook.com/intern/diff/D84108770)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164898
Approved by: https://github.com/oulgen
First fix for https://github.com/pytorch/pytorch/issues/164756
In the pipeline IR we call `UNSHARD` and `RESHARD`, but there is a bug: calling `module.unshard()` does not recursively unshard the nested FSDP modules, which sometimes leads to the all-gather being called right before the module forward.
Since we want the pipeline IR to explicitly handle this, we can call `group.unshard` instead, which ensures that all the modules are unsharded.
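A rough sketch of the requirement (the duck-typed `.unshard()` check is an assumption; the actual fix calls `group.unshard`, which achieves the same effect):
```python
def unshard_stage(stage_module):
    # Unsharding only the top-level module does not recurse into nested FSDP
    # modules, so walk every submodule and unshard each FSDP-managed one
    # before the stage forward runs.
    for submodule in stage_module.modules():
        unshard = getattr(submodule, "unshard", None)  # assumption: FSDP2-style .unshard()
        if callable(unshard):
            unshard()
```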
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164775
Approved by: https://github.com/weifengpy
**Summary**
This PR provides an interface for users to specify how to load-balance the attention
input. The load-balance is essentially a rearrangement of the input tensor(s) over the
seq_dim before sharding and can be specified via an index tensor `rearrange` such
that Q[rearrange] is the balanced Q users want (i.e. `rearrange[i] == j` where `i` is the new
index of `Q[j]` in the balanced Q). An example is the `_generate_round_robin_indices()` added
in https://github.com/pytorch/pytorch/pull/155442.
**New `_LoadBalancer` classes**
New `_LoadBalancer` class (defined in `torch/distributed/tensor/experimental/_load_balancer.py`)
provides one interface for defining load-balance behavior: `_generate_indices(self, restore: bool = False)`.
When `restore == False`, this method should output an index Tensor (namely `rearrange_idx`) such
that QKV will be transformed into Q' K' V' in a way that `Q'[i] == Q[rearrange_idx[i]]` (same applies
to K and V).
When `restore == True`, this method outputs an index Tensor (namely `restore_idx`) such that
`Q'[restore_idx] == Q` (same applies to K and V).
**Impact**
Two public CP APIs and one private CP API are modified. This PR should be backward-compatible because:
- For uses with SDPA, existing users must be using the `context_parallel()` API, which does not take the extra `load_balancer` argument and determines the behavior solely from the global var `_cp_options.enable_load_balance`.
- For new users, including those who want to try `flex_attention()`, we require using the new API `_context_parallel_buffers` to explicitly shard the QKV input instead of using `context_parallel()`, because we no longer rely on TorchDispatchMode nor TorchFunctionMode for op replacement. We also require users to explicitly pass in a `load_balancer` argument if load-balancing is desired.
**Load-Balance Behavior**
The `context_parallel_unshard()` and `create_cp_block_mask()` APIs now take an extra optional argument
`load_balancer`. This argument is optional for backward compatibility, but we require new users
to explicitly pass in a `load_balancer` if load-balancing is desired:
- if `load_balancer == None` and `_cp_options.enable_load_balance == False`, CP performs no load-balancing on the input Tensors.
- if `load_balancer == None` and `_cp_options.enable_load_balance == True`, CP performs head-tail load-balancing (i.e. split a Tensor into 2*N chunks, where the first N are called heads and the rest tails; place the first head chunk and the last tail chunk on rank 0, the second head chunk along with the second-last tail chunk on rank 1, and so on; see the sketch below).
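A minimal sketch of the head-tail index generation described above (not the actual `_HeadTailLoadBalancer` implementation):
```python
import torch

def head_tail_rearrange_indices(seq_len: int, world_size: int) -> torch.Tensor:
    # Split the sequence into 2*N equal chunks; rank i is assigned head chunk i
    # and tail chunk (2*N - 1 - i). The returned index tensor `rearrange_idx`
    # satisfies Q_balanced[i] == Q[rearrange_idx[i]].
    assert seq_len % (2 * world_size) == 0
    chunks = torch.arange(seq_len).chunk(2 * world_size)
    order = []
    for rank in range(world_size):
        order.append(chunks[rank])                       # head chunk for this rank
        order.append(chunks[2 * world_size - 1 - rank])  # matching tail chunk
    return torch.cat(order)

print(head_tail_rearrange_indices(seq_len=8, world_size=2))  # [0, 1, 6, 7, 2, 3, 4, 5]
```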
`_context_parallel_buffers()` also takes the extra optional argument `load_balancer`, but the behavior
is slightly different from the other 2 APIs -- it doesn't branch on `_cp_options.enable_load_balance`:
- if `load_balancer == None`, no load-balancing will be performed
- otherwise, apply load-balancing using `load_balancer._generate_indices()` before sharding.
**Changes**
This PR moves the index Tensor generation logic into a set of LoadBalancer classes and
makes LoadBalancer the common interface for the Context Parallel APIs that leverage
load-balancing:
* _context_parallel_buffers
* context_parallel_unshard
* create_cp_block_mask
The `_LoadBalancer` classes added are:
- `_LoadBalancer`: the abstract base class that provides the `_generate_indices()` interface for index Tensor generation.
- `_HeadTailLoadBalancer`: Implements head-tail balancing logic.
- `_PerDocumentHeadTailLoadBalancer`: Supports per-document head-tail balancing for batched sequences.
**Test**
`pytest test/distributed/tensor/test_attention.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161062
Approved by: https://github.com/fegin
BlockMask carries batch-dimension information, so PP has to split it as well, just like all other tensors. All the tensors inside BlockMask have the batch dimension, so we can split them without too many issues. However, `mask_mod` takes the batch index as an input, and that value changes after the split, so we have to wrap it inside a closure that remaps the batch index.
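A minimal sketch of the closure idea (`batch_offset` is an illustrative parameter; the actual offset plumbing in the PP split is not shown):
```python
def make_local_mask_mod(global_mask_mod, batch_offset: int):
    # After splitting the BlockMask along the batch dimension, the microbatch
    # sees local batch indices starting at 0, so the wrapped mask_mod must
    # translate them back to the original global batch indices.
    def local_mask_mod(b, h, q_idx, kv_idx):
        return global_mask_mod(b + batch_offset, h, q_idx, kv_idx)
    return local_mask_mod
```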
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164111
Approved by: https://github.com/H-Huang
During 2.9 RC testing I am seeing an issue on Amazon Linux 2023 with CUDA 13.0 builds.
This is related to:
https://github.com/pytorch/pytorch/issues/152756
Workflow: https://github.com/pytorch/test-infra/actions/runs/18324074610/job/52184079262
Error:
```
WARNING: There was an error checking the latest version of pip.
+ python3.11 .ci/pytorch/smoke_test/smoke_test.py --package torchonly
Traceback (most recent call last):
File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 333, in _load_global_deps
ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
File "/usr/lib64/python3.11/ctypes/__init__.py", line 376, in __init__
self._handle = _dlopen(self._name, mode)
^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: libcudart.so.13: cannot open shared object file: No such file or directory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/pytorch/pytorch/.ci/pytorch/smoke_test/smoke_test.py", line 12, in <module>
import torch
File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 425, in <module>
_load_global_deps()
File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 383, in _load_global_deps
_preload_cuda_deps(lib_folder, lib_name)
File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 317, in _preload_cuda_deps
raise ValueError(f"{lib_name} not found in the system path {sys.path}")
Traceback (most recent call last):
ValueError: libnvToolsExt.so.*[0-9] not found in the system path ['/pytorch/pytorch/.ci/pytorch/smoke_test', '/usr/lib64/python311.zip', '/usr/lib64/python3.11', '/usr/lib64/python3.11/lib-dynload', '/usr/local/lib64/python3.11/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/lib64/python3.11/site-packages', '/usr/lib/python3.11/site-packages']
File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in <module>
main()
File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
run_cmd_or_die(f"docker exec -t {container_name} /exec")
File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t 7d9c5bd403cac9a9ee824d63a1d6f6057ecce89a7daa94a81617dbf8eff0ff2e /exec failed with exit code 1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164870
Approved by: https://github.com/Camyll
Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Adding source_get_cache also to the AOT compile case. Since the guard manager loader code can be shared between AOT and caching, we added a new function, load_guard_manager, to avoid duplicating the guard-loading code between the two workflows.
Test Plan: test_guard_serialization.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164773
Approved by: https://github.com/yiming0416, https://github.com/dolpm
# TLDR
This PR removes the regression in torch.topk introduced in torch 2.7.0 and delivers much better performance for large inputs.
The table below reports execution times on H20 for various input sizes with float32 data, extracting the top-100 values. Results indicate that this PR restores and improves performance, especially on large inputs.
| Input Shape | torch2.6.0 (ms) | torch2.8.0 (ms) | 2.8.0+this PR (ms) |
| -------------- | --------------- | --------------- | ------------------ |
| (1, 1B) | 36.6 | 1564.1 | 25.6 |
| (1, 100M) | 3.56 | 17.4 | 2.54 |
| (1, 1,000,000) | 0.135 | 0.145 | 0.098 |
| (512, 128000) | 1.33 | 1.33 | 1.32 |
| (8192, 128000) | 19.6 | 19.6 | 19.4 |
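For reference, a minimal sketch of how such timings can be collected (not the exact benchmark harness used for the table above):
```python
import torch

def time_topk(shape, k=100, iters=10):
    x = torch.randn(shape, dtype=torch.float32, device="cuda")
    torch.topk(x, k)  # warm-up
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        torch.topk(x, k)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # average time in ms

print(time_topk((1, 100_000_000)))
```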
# Background
After upgrading PyTorch from 2.6.0 to 2.7.0, we observed a significant GPU performance regression in `torch.topk` on NVIDIA GPUs. For instance, extracting the top-1000 largest values from one billion floats on an NVIDIA H20 increased from **36 ms** to **1.6 s**.
Profiling with Nsight Compute indicates that the slowdown is caused by redundant memory accesses introduced in [PR #145536](https://github.com/pytorch/pytorch/pull/145536).
# Analysis
`torch.topk` relies on **RadixSelect** to find the target values. Each radix pass requires computing a histogram of the input values. For large inputs, histogram computation is split into two stages:
1. **Local histogram**: Each CUDA block processes a subset of the input and writes its local histogram to global memory.
2. **Global reduction**: A single CUDA block reads all local histograms from global memory and reduces them into the final global histogram.
Before [PR #145536](https://github.com/pytorch/pytorch/pull/145536), both stages ran inside a single kernel (`radixFindKthValues`), using a semaphore to ensure that all local histograms were completed before reduction.
In PR #145536, the global histogram computation was merged with subsequent top-k calculations into a single kernel (`computeBlockwiseKthCounts`) to avoid the semaphore. While this simplifies synchronization, it introduces **redundant memory reads**:
- `computeBlockwiseKthCounts` launches `numInputSlices * blocks_per_slice` blocks.
- For each row (slice), `blocks_per_slice` CUDA blocks redundantly reload the same local histograms from global memory.
# This PR
To address this inefficiency, we introduce the following optimizations:
1. **Dedicated kernel**: Refactor global histogram and cumsum computation into a separate GPU kernel, `computeDigitCumSum`.
2. **Loop unrolling**: Apply loop unrolling in `computeDigitCumSum` to speed up local histogram reads.
# Performance
We benchmarked torch.topk on NVIDIA H20 with float32 inputs, extracting the top-100 values across different input sizes. The results in the table below demonstrate that this PR effectively eliminates the performance regression introduced in 2.7.0 and delivers substantial improvements on large inputs.
| Input Shape | torch2.6.0 (ms) | torch2.8.0 (ms) | 2.8.0+this PR (ms) |
| -------------- | --------------- | --------------- | ------------------ |
| (1, 1B) | 36.6 | 1564.1 | 25.6 |
| (1, 100M) | 3.56 | 17.4 | 2.54 |
| (1, 1,000,000) | 0.135 | 0.145 | 0.098 |
| (512, 128000) | 1.33 | 1.33 | 1.32 |
| (8192, 128000) | 19.6 | 19.6 | 19.4 |
In addition, I have verified the correctness of this PR with different inputs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164459
Approved by: https://github.com/ngimel, https://github.com/Skylion007
Summary:
The context module provides configurable context selection + isolation key hashing.
Context selection is broken into runtime and compile context. Runtime context is decided at call time (inductor configs, precision configs, etc.) and compile context is decided at compile time (hardware type, software hashes).
Callees will be given access to SelectedRuntimeContext and SelectedCompileContext, which they can use to determine and select what context is necessary with regard to the function being cached.
These selected contexts are wrapped in an IsolationSchema, which denotes what context should be taken into consideration when producing an isolation key. The isolation key is essentially a salt of the function signature key: it says that some function signature key result is valid under a given context (isolation schema).
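A toy sketch of the salting idea (names and hashing details are illustrative, not the actual implementation):
```python
import hashlib
import json

def isolation_key(function_signature_key: str, isolation_schema: dict) -> str:
    # Salt the function signature key with a hash of the selected context, so
    # a cached result is only considered valid under the same context.
    context_hash = hashlib.sha256(
        json.dumps(isolation_schema, sort_keys=True).encode()
    ).hexdigest()
    return hashlib.sha256(f"{function_signature_key}:{context_hash}".encode()).hexdigest()

print(isolation_key("fn_sig_key", {"hardware": "H100", "precision": "tf32"}))
```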
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```
Reviewed By: aorenste
D83714689
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164549
Approved by: https://github.com/aorenste
This PR is to temporarily unblock various experiments that re-use the fake mode dynamo creates. Note that this is still not what we want as the end state. The end state should look something like:
```
out = fullgraph_capture(mod, inputs)
fake_mode = out.backend_inputs.fake_mode
gm = out.module()
```
This doesn't work today because export requires wrapping the original module to set up a flat module to trace, for easier handling of pytree. As a result, we would need to carry an export-specific flag in fullgraph_capture, which seems not ideal.
Regardless, the end state is that we need to give downstream users a graph module and a fake mode in some form, so I think having _dynamo_graph_capture_for_export return the fake mode within the graph module itself via gm.meta is reasonable for now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164730
Approved by: https://github.com/avikchaudhuri
Fixes#89034
Updated tensor_to_numpy() function in tensor_numpy.cpp to handle ZeroTensors by throwing an error if force=False and returning an array full of zeros if force=True.
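A small sketch of the intended behavior, assuming the private `torch._efficientzerotensor` factory to construct a ZeroTensor and that the rejection surfaces as a standard Python error:
```python
import torch

z = torch._efficientzerotensor(3)   # assumption: constructs a ZeroTensor (no real storage)

print(z.numpy(force=True))          # returns a plain all-zeros numpy array

try:
    z.numpy()                       # without force, conversion is rejected
except (RuntimeError, TypeError) as e:  # exact error type depends on the C++ change
    print("expected error:", e)
```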
@ngimel, I just saw that you mentioned PyTorch is not too concerned with this issue but I had already worked on it so I figured I would push it anyways and see what you thought. Feel free to close the PR if you think it is not worth merging.
@albanD
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164487
Approved by: https://github.com/izaitsevfb
Fixes some tests that seemed to start flaking out as reported in #163202, due to cuBLASLt workspaces becoming persistent following that change.
It's relatively obvious why the workspaces/allocations corresponding to them should be cleaned up for `test_memory_snapshot_script`, but it is less obvious for `test_memory_plots_free_segment_stack`: why does not cleaning up the workspace prevent `empty_cache` from showing up?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163299
Approved by: https://github.com/albanD
Fixes #163330
I tried to reproduce the bug with my 4-GPU setup (the original issue used 8 GPUs). I created several different test scenarios, trying to trigger the bug by:
- creating two different device meshes
- slicing them in various ways
- checking if get_root_mesh() would get confused
but the bug didn't show up! Everything worked correctly in `2.10`. I found that there was a massive refactoring of the `DeviceMesh` code (PR #163213) that landed on October 2nd. That PR completely rewrote how `DeviceMesh` tracks relationships between parent meshes and submeshes. It seems like this refactoring fixed the bug! But I added a regression test to make sure it doesn't come back. The test (`test_get_root_mesh_multiple_independent_meshes`) does exactly what the bug report described:
- creates two independent meshes
- slices them both
- verifies that each submesh correctly points back to its real parent
- makes sure submeshes from mesh1 don't incorrectly claim mesh2 as their parent
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164731
Approved by: https://github.com/fduwjj
Python 3.13 added PyObject_GetOptionalAttrString. I'm not 100% certain that it is strictly better than the old approach in all cases, but based on documentation/comments it seems to be meant for this type of use, and it's faster when I profile torchtitan training (which hits the "check for the `__torch_function__` attr on some object" part of maybe_has_torch_function frequently enough to notice, but previously wasted a bunch of time generating exceptions that we then suppressed here).
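A Python-level analogy of the difference (the real change is in the C++ binding layer, so this is purely illustrative):
```python
class HasNoTorchFunction:
    pass

obj = HasNoTorchFunction()

# Old approach (roughly): look up the attribute, let the lookup raise
# AttributeError, and suppress it -- the exception object still gets created
# on a hot path.
try:
    fn = obj.__torch_function__
except AttributeError:
    fn = None

# PyObject_GetOptionalAttrString is closer to getattr with a default:
# "attribute not found" is reported without materializing an exception.
fn = getattr(obj, "__torch_function__", None)
```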
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164624
Approved by: https://github.com/Skylion007
Summary:
Previously, weight deduplication was done by simply grouping tensors with their untyped storage and saving the first tensor in the group.
A more rigorous approach would be to find a complete tensor that covers the storage and store that tensor. This is particularly important for GPU weights because when saving to raw bytes, we move the weight to CPU first, and if the weight being saved is not a complete one, it will lose the storage information during the copy to CPU.
In this diff, we reuse code in `_package_weights.py` for better weights and constants deduplication in `torch.export.save`.
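A minimal sketch of the "complete tensor" check described above (the actual logic lives in `_package_weights.py`):
```python
import torch

def find_complete_tensor(tensors):
    # A tensor "covers" its untyped storage when it starts at offset 0, is
    # contiguous, and spans every byte of the storage; saving that tensor
    # keeps the full storage intact even after the copy to CPU.
    for t in tensors:
        storage = t.untyped_storage()
        if (
            t.storage_offset() == 0
            and t.is_contiguous()
            and t.numel() * t.element_size() == storage.nbytes()
        ):
            return t
    return None

base = torch.arange(16.0)
view = base[4:8]
assert find_complete_tensor([view, base]) is base  # only `base` covers the storage
```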
Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_weight_sharing_gpu
Differential Revision: D83523690
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164196
Approved by: https://github.com/angelayi
Summary: Relax absolute tolerance from 1e-2 to 1e-1 for `test_non_contiguous_input_mm_plus_mm` in `test_max_autotune.py`.
Test Plan: `test_max_autotune.py`
Differential Revision: D83391942
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164022
Approved by: https://github.com/eellison
Summary:
Fix decompose_k test failure (`test_max_autotune_decompose_k`) in `test_max_autotune.py` on B200s by setting `torch._inductor.config` patches for variables `comprehensive_padding` and `shape_padding`. Initial failure was `AssertionError: False is not true : Could not find a split in {3, 9, 2187, 81, 243, 729, 27} in # AOT ID: ['6_forward']`.
Refactor decompose_k test to follow patch semantics when setting all environment variables within a test.
Test Plan:
`test_max_autotune.py`:
```
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:max_autotune -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -c fbcode.re_gpu_tests=False -- test_max_autotune_decompose_k
```
Differential Revision: D83390563
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164021
Approved by: https://github.com/njriasan, https://github.com/mlazos, https://github.com/eellison
Adds [control_deps](https://en.wikipedia.org/wiki/Control_dependency) higher-order operator to enforce explicit scheduling dependencies in FX graphs. This prevents unwanted operation reordering/fusion by giving nodes additional dependencies, which we also respect in inductor by adding weakdeps on the additional dependencies.
This can be generally useful (such as for ordering collectives) but in this case I am using it so that fusions do not interfere with aten planned comm-compute overlap.
There's definitely some similarity with the `with_effects` hop. Talked with @angelayi - when @zou3519 is back we will figure out how we want to consolidate.
The implementation needs to be a subgraph (as opposed to `with_effects`) because inductor relies on `V.graph.current_node`. Changing the signature of the node with `with_effects` breaks this, and additionally, also breaks striding constraints on the wrapped node - see this [TODO](aed66248a0/torch/fx/experimental/proxy_tensor.py (L1246-L1249)). By maintaining the node with its original calling structure in subgraph this all works.
Example transformation:
Before:
```
%add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg0_1, 1), kwargs = {})
%mm : [num_users=1] = call_function[target=torch.ops.aten.mm.default](args = (%arg1_1, %arg1_1), kwargs = {})
%mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add, 2), kwargs = {})
```
After:
```
add: "f32[256, 256]" = torch.ops.aten.add.Tensor(arg0_1, 1)
mm: "f32[256, 256]" = torch.ops.higher_order.control_deps((add,), subgraph_mm, arg1_1, arg1_1)
mul: "f32[256, 256]" = torch.ops.higher_order.control_deps((mm,), subgraph_mul, add)
```
The mm operation now explicitly depends on add completing first, and mul depends on mm, with original operations preserved in subgraphs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164568
Approved by: https://github.com/ezyang, https://github.com/IvanKobzarev
This PR applies clang-tidy readability checks to jit sources and all headers in the code base.
`readability-redundant-inline-specifier` is suppressed because it incurs too many changes: it detects redundant inline specifiers on function and variable declarations, and there are many in-class method definitions that are marked inline.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164652
Approved by: https://github.com/Skylion007
```bash
DEBUG /data/vllm-community-homes/vllm-user-6/pytorch/aten/src/ATen/cuda/CUDAGraph.h(59): warning #68-D: integer conversion resulted in a change of sign
DEBUG CaptureId_t capture_id_ = -1;
DEBUG ^
DEBUG
DEBUG Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
DEBUG
DEBUG /data/vllm-community-homes/vllm-user-6/pytorch/aten/src/ATen/cuda/CUDAGraph.h(59): warning #68-D: integer conversion resulted in a change of sign
DEBUG CaptureId_t capture_id_ = -1;
DEBUG ^
DEBUG
DEBUG Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
DEBUG
DEBUG /data/vllm-community-homes/vllm-user-6/pytorch/aten/src/ATen/cuda/CUDAGraph.h(59): warning #68-D: integer conversion resulted in a change of sign
DEBUG CaptureId_t capture_id_ = -1;
DEBUG ^
```
CUDA won't use 0 as a capture id, so it is safe to initialize it with 0, which also matches the initialization in `pytorch/aten/src/ATen/native/cudnn/RNN.cpp:2362`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163898
Approved by: https://github.com/houseroad