Compare commits

..

134 Commits

Author SHA1 Message Date
e10ed21af7 Use runner with more memory for ASAN builds 2025-10-08 15:09:43 -07:00
aed5ed1076 Refactor memory estimator to use node storages, add test (#164783)
- Update the Memory Estimator to use node storages for analysis, which simplifies bookkeeping compared to manually inspecting operator schemas. This will also allow me to reuse this component elsewhere.

- Factor out into a separate class, so that this same logic can be used in scheduling (node allocations / aliasing / uses)

- Adds tests for correctness - right now only on fwd/bwd individually, not on both together.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164783
Approved by: https://github.com/ruisizhang123
ghstack dependencies: #164738
2025-10-08 22:07:43 +00:00
af4c29fea8 [dynamo, nested graph breaks] fix nested step graph break related issues (#162737)
It turns out that codegen'ing a nested step graph break is significantly more complicated than first thought. The optimized function should actually:
- call the graph, load values, apply side effects, etc.
- call into the leaf's resume function, but skipped (this is essentially a step graph break for just the leaf function)
- call into all the other resume functions, traced.

This PR also adds `torch._dynamo.step_unsupported()`, which can be used for internal testing purposes to better test step graph break handling.
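
A rough illustration of how the new testing hook might be used (illustrative only; the exact call-site semantics are an assumption, not spelled out in this PR text):

```python
import torch

@torch.compile(backend="eager")
def f(x):
    x = x + 1
    torch._dynamo.step_unsupported()  # testing hook added in this PR; forces a step graph break here
    return x * 2
```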

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162737
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #160601
2025-10-08 22:02:52 +00:00
486b4d2414 [dynamo, nested graph breaks] move cell codegen before side effects codegen (#160601)
This is needed because if we codegen cells for nested frames AFTER side effects, then reconstruction could get messed up. From below:

>The added test case demonstrates the reconstruction failure if we kept cell codegen at the original place (only happens with nested graph breaks since we reconstruct nested frame cells from VariableTracker rather than directly using LOAD_CLOSURE).

>At a high level, what happened before this change was that side_effects was pruning the cells (I don't recall exactly why this happens), and because cells were codegen'd after the side effects were applied, we were unable to properly reconstruct the cell. The error I was seeing was a list/tuple IndexError.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160601
Approved by: https://github.com/mlazos
2025-10-08 22:02:52 +00:00
8f83b3e71c add device generalization support for distributed checkpoint tests (#159242)
## MOTIVATION
To generalize Distributed checkpoint test cases for non-CUDA devices

## CHANGES
Updated 18 test files in test/distributed/checkpoint/ with minimal device-abstraction changes:

- Use device_type from DTensorTestBase wherever appropriate
- Replaced hard-coded device names with torch.accelerator.current_accelerator()
- Extended the multi-GPU decorator to support other devices

test/distributed/checkpoint/test_state_dict_stager.py has a large diff because I renamed cuda_obj to gpu_obj; the functional change is minimal.
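
As a small illustration of the pattern used in these tests (not the test code itself), deriving the device type at runtime instead of hard-coding "cuda":

```python
import torch

# torch.accelerator.current_accelerator() returns the active accelerator device
# (or None on CPU-only builds), so tests can avoid hard-coding "cuda".
acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"
x = torch.ones(4, device=device_type)
```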

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159242
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-10-08 21:56:31 +00:00
f0c9f3bddb [PP] [BE] Remove runtime tests (#164962)
BE: cleaning up dead code, since we migrated the multi-stage schedules to use the schedule execution runtime.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164962
Approved by: https://github.com/Skylion007
ghstack dependencies: #162016
2025-10-08 21:42:33 +00:00
1d182dd81c [MPS] sparse norm (#164961)
Adds norms for sparse MPS tensors.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164961
Approved by: https://github.com/malfet
2025-10-08 21:41:42 +00:00
0b15f7ae05 [fr] Enable dynamic path write for FR dump when it comes to torchft (#164752)
For FR dumps in the fault-tolerance case, users want to point the dump path to a different location after a restart, so we enable that case for users here.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164752
Approved by: https://github.com/tushar00jain
2025-10-08 21:36:32 +00:00
f1229b6db9 [BE] Remove manual IP address resolution (#164969)
As https://github.com/pytorch/pytorch/issues/100400 was closed a while back.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164969
Approved by: https://github.com/seemethere
ghstack dependencies: #164968
2025-10-08 21:22:34 +00:00
b1ac252f55 [Replicate][Test] tests that pp model grads are the same as single-device model grads (#164890)
**Summary:** Created a test so that we can verify that a model that has been pipelined + replicated has the same gradients as a reference model. To do this, I mapped the layers and their parameters in each partial model to the original full model and then compared the gradients.
**Test Case**
1. pytest test/distributed/_composable/test_composability/test_pp_composability.py -k test_replicate_pp_grads
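
A generic sketch (not the PR's test code) of the comparison strategy described above, assuming a `name_map` from stage parameter names to full-model names:

```python
import torch

def check_stage_grads(stage_model, ref_model, name_map, rtol=1e-5, atol=1e-5):
    # Map each stage parameter back to its name in the full reference model,
    # then compare gradients after both models have run backward.
    ref_params = dict(ref_model.named_parameters())
    for name, param in stage_model.named_parameters():
        ref_grad = ref_params[name_map[name]].grad
        torch.testing.assert_close(param.grad, ref_grad, rtol=rtol, atol=atol)
```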

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164890
Approved by: https://github.com/H-Huang
2025-10-08 21:07:05 +00:00
5ba11df4f8 [DeviceMesh] Make all members of DeviceMesh private and add public access API (#164954)
This is a mostly mechanical change which makes all DeviceMesh members private and exposes them through public property APIs instead. It is not a BC-breaking change, since the new API still guarantees BC.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164954
Approved by: https://github.com/fegin
ghstack dependencies: #164750
2025-10-08 21:04:07 +00:00
15800888b6 [CI] Print GPU info during setup linux (#164968)
I.e. run `nvidia-smi` if present

This helps detect what driver version the runner is on, which would have helped in debugging some recent issues.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164968
Approved by: https://github.com/ngimel
2025-10-08 20:58:33 +00:00
e7ed1a00eb Run inductor-perf-test-nightly-h100 once per day (#164967)
To reduce inductor costs, though I'm not sure how much this one matters specifically since h100s are reserved

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164967
Approved by: https://github.com/BoyuanFeng
2025-10-08 20:58:19 +00:00
2982406721 [inductor] ban benchmarking by default in deterministic mode (#164532)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164532
Approved by: https://github.com/eellison
ghstack dependencies: #164801
2025-10-08 20:55:15 +00:00
005c3d449e Support custom callback functions in schedule (#162016)
This is going to be used in https://github.com/pytorch/torchtitan/issues/1682

Add a `register_custom_function` to the `_PipelineScheduleRuntime` which allows users to implement any custom function to replace the runtime operation dynamically.

The signature of the callback should look like:

```python
class _CustomFunctionProtocol(Protocol):
    def __call__(self, action: _Action, ctx: _PipelineContext) -> None: ...
```

`_PipelineContext` contains a reference to the schedule which is executing the operations.
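
A minimal sketch of a callback matching this protocol; the registration call follows the description above, but its exact signature and the context attributes are assumptions here:

```python
from typing import Any

def custom_forward(action: Any, ctx: Any) -> None:
    # `action` is the scheduled _Action; `ctx` is the _PipelineContext, which
    # holds a reference to the schedule executing the operations.
    pass

# Hypothetical wiring on a _PipelineScheduleRuntime instance:
# schedule.register_custom_function(FORWARD, custom_forward)
```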

### Testing

Added a test which adds custom methods for `FORWARD` and `OVERLAP_F_B` which are just the same implementations as those used in the default schedule runtime. Check that the schedule can still run, numerics are correct, and the callbacks are executed the correct number of times.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162016
Approved by: https://github.com/fegin
2025-10-08 20:43:26 +00:00
b2b3947565 [DeviceMesh] Remove private _set_mesh_dim_group_options API (#164750)
We allow passing in the PG option via https://github.com/pytorch/pytorch/pull/159371, and we cleaned up Meta-internal usage of `_set_mesh_dim_group_options`. Since this is a private API with no BC guarantee, we remove it directly so that people use the new behavior from now on.

Also, since we now allow passing the PG option in both the DeviceMesh constructor and the flatten API, we want to get rid of the global PG option override variable as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164750
Approved by: https://github.com/lw, https://github.com/fegin
2025-10-08 20:38:17 +00:00
81994b08a0 [inductor] don't tune xblock for reduction (#164801)
It turns out that tuning XBLOCK for a reduction can also change numerics ( https://github.com/pytorch/pytorch/pull/164525#pullrequestreview-3306235454 ).

This PR skips tuning XBLOCK for a reduction. If we have multiple configs left with different XBLOCKs, the heuristic will pick the config with the second-largest XBLOCK.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164801
Approved by: https://github.com/jansel, https://github.com/mlazos, https://github.com/v0i0
2025-10-08 20:31:39 +00:00
71aefd5595 [reland] Allow setting grad_dtype on leaf tensors (#164751)
ghstack-source-id: e44b3941530be83a630ec93f1478eec741ffca2e
Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/162815

Fixes #ISSUE_NUMBER

Relanding due to internal weirdness. Separate PR to codev w/o ghstack.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164751
Approved by: https://github.com/albanD
2025-10-08 20:23:13 +00:00
001e1d2637 Add memory estimator (#164738)
Original work by @ShatianWang, with lints applied. I am going to make a few changes and add tests in subsequent PRs, but I want to preserve the original commit first.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164738
Approved by: https://github.com/IvanKobzarev
2025-10-08 20:04:33 +00:00
e0cb1848d0 Use TMA loads always for Triton grouped MM kernel (#164256)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164256
Approved by: https://github.com/ngimel
2025-10-08 19:40:06 +00:00
a4110fedcf Use insert_or_assign instead of erase+emplace (#164868)
insert_or_assign does effectively the same thing as erase+emplace, but more efficiently, since the search does not need to be repeated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164868
Approved by: https://github.com/eqy
2025-10-08 19:13:49 +00:00
37c6087334 Add split-K control to cuBLAS reduced-precision settings (#164766)
## Summary
- add a CuBLASReductionOption enum so the CUDA context can track reduced-precision and split-K options
- extend the Python bindings, backend helpers, and docs to accept an optional allow_splitk argument for fp16/bf16 matmul controls
- update cuBLAS/cuBLASLt call sites plus dynamo guards and tests to respect the new combinations
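
For reference, a minimal sketch of the existing Python-side control this builds on; per the summary above the bindings now also accept an optional allow_splitk argument, but that exact surface is not shown here since it isn't spelled out in this text:

```python
import torch

# Existing boolean control for fp16 reduced-precision reductions in matmuls.
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
print(torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction)  # True
```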

## Testing
- python test/test_cuda.py TestCuda.test_cublas_allow_fp16_reduced_precision_reduction_get_set -v *(fails: ModuleNotFoundError: No module named 'psutil')*

------
https://chatgpt.com/codex/tasks/task_e_68e404623178832f8a3e1d34e1e175da

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164766
Approved by: https://github.com/malfet, https://github.com/albanD
2025-10-08 18:48:45 +00:00
0b85236477 Fix refine_ranges corner case (#164075) (#164846)
Summary:
address https://github.com/pytorch/pytorch/issues/161360

u0 > 0 should update the range of u0 to start from [1, ...]; it was not doing that, and this fixes it.

Test Plan: contbuild & OSS CI, see 27234792ad

D84038721

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164846
Approved by: https://github.com/izaitsevfb, https://github.com/ezyang
2025-10-08 18:42:37 +00:00
4c0fec3e4d [Max Autotune][B200] Skip carveout tests (#164435)
Summary: Skip sm `carveout` tests on B200, as carveout is currently unsupported.

Test Plan:
```
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:max_autotune -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -c fbcode.re_gpu_tests=False -- test_honor_sm_carveout_with_triton_tma
```

Differential Revision: D83395610

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164435
Approved by: https://github.com/eellison
2025-10-08 18:39:43 +00:00
cyy fdc622b513 [CMake] Remove LLVM link code (#134940)
This handling is not needed with recent LLVM APIs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134940
Approved by: https://github.com/ezyang, https://github.com/malfet
2025-10-08 18:39:16 +00:00
91b9484264 [ez] fix small doc error (#164915)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164915
Approved by: https://github.com/svekars
2025-10-08 18:27:44 +00:00
5c827a4133 [SymmMem] Multi-root tile reduction (#164757)
Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.12.0) (oldest at bottom):

Perform multiple tile reductions concurrently, with each tile reduced to a separate root.

- The number of concurrent reductions can be smaller than world size, i.e. roots can be a subset of all ranks. But all ranks are still required to call into this API.

- Currently supports NVLink SHARP scope only.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164757
Approved by: https://github.com/weifengpy, https://github.com/fegin
ghstack dependencies: #162243
2025-10-08 17:28:00 +00:00
83458197d1 [Benchmark] remove old timm models from benchmark (#164805)
Prune models from the TorchInductor dashboard to reduce CI cost. This PR prunes timm models according to the [doc](https://docs.google.com/document/d/1nLPNNAU-_M9Clx9FMrJ1ycdPxe-xRA54olPnsFzdpoU/edit?tab=t.0), reducing from 60 to 14 models.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164805
Approved by: https://github.com/anijain2305, https://github.com/seemethere, https://github.com/huydhn, https://github.com/malfet
2025-10-08 17:14:58 +00:00
0b01ff4de0 [ROCm] Improve non stride-one backwards indexing for small index sets (#164409)
This patch fixes a performance problem which occurs when a small set of indices is used and there are practically no duplicates.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164409
Approved by: https://github.com/jerrymannil, https://github.com/jeffdaily
2025-10-08 17:04:52 +00:00
01f3a43462 [MPS] Update OS version in error message (#164946)
Followup after https://github.com/pytorch/pytorch/pull/159912
Fixes https://github.com/pytorch/pytorch/issues/164943

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164946
Approved by: https://github.com/Camyll
2025-10-08 16:43:50 +00:00
f332017294 C++ API handle optimizer defaults (#161825)
Fixes #141884

This fixes the issue for all optimizers and parameter options.
A member function `overwrite_from` is added to the optimizer base class. Each optimizer then implements this function for comparing their accepted parameters to defaults. A SFINAE approach to handle the different optimizer parameters generically (in optimizer.h only) was evaluated, but I think this is easier to review and maintain.

This mirrors the Python API up to one edge case. An example of the edge case is provided below.

Python can distinguish between 1) Key not present in dict = "not specified"  and 2) Key present in dict = "explicitly set". The C++ implementation cannot.
The issue hinges on whether or not to track if a particular parameter was set by the user explicitly or not (discrepancy in the case when the constructor default is explicitly passed in).

To track this seems like it will take more intervention than would be worth it (modify TORCH_ARG to keep track, use std::optional for the parameter types, use bitset tracking) and was not pursued in the current PR. I'm happy to alter the design if appropriate.

### Example of edge case hinging on CONSTRUCTOR DEFAULTS vs OPTIMIZER DEFAULTS

1. CONSTRUCTOR DEFAULTS:
   These are the values you get when calling AdamOptions()
   AdamOptions().lr() = 0.001
   AdamOptions().weight_decay() = 0
   AdamOptions().eps() = 1e-08

2. OPTIMIZER DEFAULTS:
   These are the values the user chose when creating the optimizer
   User's optimizer defaults:
   optimizer.lr() = 0.005
   optimizer.weight_decay() = 0.1
   optimizer.eps() = 1e-07

3. THE PROBLEM SCENARIO:
   User wants to add a parameter group with explicit weight_decay=0.0
   User sets: weight_decay(0)

4. THE CONFUSION:
   Constructor default weight_decay: 0
   User's explicit weight_decay:     0
   Are they equal? YES

   Since they're equal, our overwrite_from() logic thinks:
   "User didn't set weight_decay explicitly, use optimizer default"

5. CURRENT BEHAVIOR:
   Final weight_decay: 0.1
   User expected:      0
   Match?  NO

=== KEY INSIGHT ===
Constructor defaults are built into the C++ class definition.
Optimizer defaults are chosen by the user at runtime. We want to respect the user intention.
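
For comparison, the Python-side behavior this mirrors: an explicitly passed `weight_decay=0.0` is present in the group dict, i.e. "explicitly set", so it is not overwritten by the optimizer default.

```python
import torch

model = torch.nn.Linear(4, 4)
opt = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=0.1, eps=1e-7)

# The key is present in the dict, so Python keeps 0.0 instead of falling back
# to the optimizer default of 0.1.
opt.add_param_group({"params": [torch.nn.Parameter(torch.zeros(4))],
                     "weight_decay": 0.0})
print(opt.param_groups[1]["weight_decay"])  # 0.0
```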
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161825
Approved by: https://github.com/janeyx99
2025-10-08 16:40:45 +00:00
0a3e4e894c [PP]: Optimize memory by early releasing stage inputs' gradients (#164329)
It seems that we can release the input activations' gradients early in `stage_backward()` in PP, which helps reduce peak memory.

I tested this using the `1F1B` and `Interleaved1F1B` PP strategies based on torchtitan (for simplicity, I use 4 decoder layers of llama3, set the PP size to 2, and set num_microbatches to 128).
Run command using torchtitan:
```bash
CUDA_VISIBLE_DEVICES=4,5 LOG_RANK=0,1 NGPU=2 CONFIG_FILE=./torchtitan/models/llama3/train_configs/llama3_8b.toml ./run_train.sh --metrics.log_freq 1 --training.seq_len 8192 --training.steps 10 --parallelism.data_parallel_shard_degree 1 --activation_checkpoint.mode full --model.tokenizer_path /workspace/torchtitan-v0.1.0/torchtitan/torchtitan/datasets/tokenizer/original/tokenizer.model --training.dataset wikipedia --parallelism.pipeline_parallel_degree 2 --training.local_batch_size 128 --parallelism.pipeline_parallel_microbatch_size 1 --training.dataset_path /workspace/wikipedia_subset --training.seed 42 --parallelism.pipeline_parallel_schedule 1F1B
```
## 1F1B torchtitan train results
### before fix
<img width="1526" height="606" alt="b8e281cce1dac15e827c216e7d83f402" src="https://github.com/user-attachments/assets/545c0a80-6276-40c0-893f-fd2df0a53b8d" />

### after fix
<img width="1526" height="594" alt="70d5ceba311a8398d041189bf8897cfc" src="https://github.com/user-attachments/assets/0d606e08-238a-4115-a1c0-b40df101d867" />

After the fix, the memory usage on rank1 (i.e., the non-first stage) saves 6.9 GB compared to before the fix. The memory usage on rank0 remains unchanged (rank0 represents stage0).

## Interleaved1F1B torchtitan train results
### before fix
<img width="1514" height="601" alt="a28b7f9704b9234870619c43194e8a72" src="https://github.com/user-attachments/assets/2c28565f-ffff-4747-a8f5-722b5c65dc7e" />

### after fix
<img width="1526" height="621" alt="2d8d6d956b72885186f8c7059146c41a" src="https://github.com/user-attachments/assets/8c4a4ff2-336b-4e0b-8ac4-014ae22c2ed1" />

After the fix, rank1 saves 14.57 GB of memory (rank1 holds layer1 and layer3) and rank0 saves 7.5 GB (rank0 holds layer0 and layer2).

## Memory snapshot results
Also, I dumped the memory snapshot to observe memory usage under the 1F1B PP strategy.
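
For reference, a minimal sketch of how such a snapshot can be captured (the training loop itself is elided):

```python
import torch

torch.cuda.memory._record_memory_history(max_entries=100_000)
# ... run a few PP training steps here ...
torch.cuda.memory._dump_snapshot("pp_1f1b_snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
```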

### before fix
<img width="1906" height="918" alt="6fd4e4ba82b8bacf9ca6edee4f3d5581" src="https://github.com/user-attachments/assets/d1b9245c-b09f-43c5-87ce-87ba48533a70" />

We can see the memory increasing as PP step_microbatches runs (the lifetime of the input activation's gradient, i.e., the output of `FusedRMSNormBackward`, lasts too long).

### after fix
<img width="1903" height="918" alt="2e415f25af6750d06e5e647683b212b9" src="https://github.com/user-attachments/assets/b657c8f6-5a56-46bd-8743-f3b8375c81b0" />

After the fix, we get steadier memory usage during training (the input activation's gradient is released back to the allocator sooner).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164329
Approved by: https://github.com/H-Huang
2025-10-08 16:12:00 +00:00
73adac05d1 Triton 3.5.x pin update to 7416ffc (#164587)
Updates triton pin to latest: https://github.com/triton-lang/triton/commits/release/3.5.x/

This update contains one cherry-pick to fix a flex_attention_fwd regression on B200:
- https://github.com/triton-lang/triton/pull/8366

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164587
Approved by: https://github.com/atalman
2025-10-08 16:07:18 +00:00
eqy 0d39ecb2ce [cuDNN][RNN] cuDNN RNN supports BFloat16 inputs since 9.13 (#164411)
seems to work
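
A minimal repro sketch (assumes a CUDA build with cuDNN >= 9.13, per the title):

```python
import torch

rnn = torch.nn.LSTM(input_size=16, hidden_size=32, num_layers=2).to("cuda", torch.bfloat16)
x = torch.randn(5, 3, 16, device="cuda", dtype=torch.bfloat16)
out, (h, c) = rnn(x)  # exercises the cuDNN RNN path with BFloat16 inputs
```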

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164411
Approved by: https://github.com/Skylion007
2025-10-08 15:26:50 +00:00
90c0825e2d [GHF] Allow reverts from pytorch-auto-revert app (#164911)
This is a bit weird: author_login is not a unique field, but author_url is.

Explicitly allow https://github.com/apps/pytorch-auto-revert to issue revert commands

Update mocks by running
```
sed -i -e s/8e262b0495bd934d39dda198d4c09144311c5ddd6cca6a227194bd48dbfe7201/47860a8f57a214a426d1150c29893cbc2aa49507f12b731483b1a1254bca3428/ gql_mocks.json
```

Test plan: Run
```python
from trymerge import GitHubPR
pr=GitHubPR("pytorch", "pytorch", 164660)
print(pr.get_last_comment().author_url, pr.get_comment_by_id(3375785595).author_url)
```
that should produce
```
https://github.com/pytorch-auto-revert https://github.com/apps/pytorch-auto-revert
```
Plus added a regression test that checks two particular comments for revert validity

`pytorch-auto-revert` user is my alter ego :)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164911
Approved by: https://github.com/jeanschmidt
2025-10-08 15:15:45 +00:00
fd4bde430a Revert "list_stored_sd_metadata API. (#160610)"
This reverts commit da903b6a8be422529d47649e89c0d50bb95c37ca.

Reverted https://github.com/pytorch/pytorch/pull/160610 on behalf of https://github.com/jeffdaily due to broke ROCm CI, but flaky also on CUDA CI https://hud.pytorch.org/failure?name=periodic%20%2F%20linux-jammy-rocm-py3.10%20%2F%20test%20(distributed%2C%202%2C%203%2C%20linux.rocm.gpu.mi250.4%2C%20module%3Arocm%2C%20oncall%3Adistributed)&jobName=undefined&failureCaptures=distributed%2Fcheckpoint%2Ftest_list_stored_state_dict.py%3A%3ATestListStateDict%3A%3Atest_list_stored_sd_metadata ([comment](https://github.com/pytorch/pytorch/pull/160610#issuecomment-3382023022))
2025-10-08 15:10:38 +00:00
b5e93ffdcf Revert "Limit path search within range (#164581)"
This reverts commit 415e641572473479fc9d9eaea12762e1a223a9e0.

Reverted https://github.com/pytorch/pytorch/pull/164581 on behalf of https://github.com/eellison due to merge sets makes this trickier ([comment](https://github.com/pytorch/pytorch/pull/164581#issuecomment-3381955240))
2025-10-08 14:56:21 +00:00
f8d0d65ddc Revert "Add memory estimator (#164738)"
This reverts commit ab01a0d7d352e7fd07989b8d6bf035bf82aea74e.

Reverted https://github.com/pytorch/pytorch/pull/164738 on behalf of https://github.com/eellison due to merge sets makes this trickier ([comment](https://github.com/pytorch/pytorch/pull/164581#issuecomment-3381955240))
2025-10-08 14:56:21 +00:00
f46ddb1e65 [ROCm][CI] add gfx1150 gfx1151 to docker images for binary builds (#164854)
Fixes #164346.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164854
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-08 14:34:22 +00:00
20082d7136 Revert "fix flex attention eager bwd: more rounding (#164317)"
This reverts commit 41808b2ba9a61ab2f4c7af394c1668d09a4a0331.

Reverted https://github.com/pytorch/pytorch/pull/164317 on behalf of https://github.com/jeffdaily due to inductor/test_flex_attention.py::TestFlexAttentionCUDA::test_builtin_score_mods_seqlen_lt_custom_sparse_block_size_score_mod4_cuda_float16 [GH job link](https://github.com/pytorch/pytorch/actions/runs/18330774537/job/52207370954) [HUD commit link](41808b2ba9) ([comment](https://github.com/pytorch/pytorch/pull/164317#issuecomment-3381812090))
2025-10-08 14:29:10 +00:00
7158aa22e8 remove more (#164753)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164753
Approved by: https://github.com/aorenste, https://github.com/mlazos
ghstack dependencies: #164664, #164665, #164667, #164668
2025-10-08 14:23:38 +00:00
2035f6b2e6 use check_size instead of check_is_size in ops.py (#164668)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164668
Approved by: https://github.com/angelayi
ghstack dependencies: #164664, #164665, #164667
2025-10-08 14:23:38 +00:00
2b58adc3bd [inductor][templates] Distinguish between kernel input nodes and codegen input nodes (#163752)
If there is a single autotuner choice, the wrong type of input node is used to instantiate `TritonTemplateBuffer` through `TritonTemplateCaller.output_node`. This PR distinguishes the input nodes used in `AlgorithmSelectorCache.__call__` between the actual inputs passed to the kernel at runtime, vs the possibly viewed inputs that influence scheduling behaviour (e.g. `MemoryDeps`) and codegen. See the added unit test for more detail.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163752
Approved by: https://github.com/eellison
2025-10-08 14:12:14 +00:00
322091d8d8 [opaque_obj] Add make_fx tracing support (#163278)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163278
Approved by: https://github.com/zou3519
ghstack dependencies: #163279, #163277
2025-10-08 09:09:16 +00:00
2bb4e6876c [opaque obj] Error for torch.library.custom_op infer_schema (#163277)
Unsure how we can get infer_schema to infer the ScriptObject type from just the type annotation, so for now we just error clearly and ask users to specify a schema.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163277
Approved by: https://github.com/zou3519
ghstack dependencies: #163279
2025-10-08 09:09:16 +00:00
56ef7743fc [opaque_obj] Add __eq__ and __deepcopy__ (#163279)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163279
Approved by: https://github.com/zou3519
2025-10-08 09:09:16 +00:00
64108bdbed [BC-Breaking] Remove long-deprecated casting functions from native_functions.yaml (#164641)
This PR removes the `torch._cast_XXX` functions from generated ops. They were deprecated in PyTorch 1.
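
The removed ops map onto the regular dtype-conversion API; e.g. (illustrative):

```python
import torch

x = torch.randn(3)
# torch._cast_Int(x) and friends were legacy casting ops; Tensor.to is the
# supported replacement.
y = x.to(torch.int32)
```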

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164641
Approved by: https://github.com/albanD, https://github.com/justinchuby
2025-10-08 08:27:58 +00:00
c855f8632e Pyrefly suppressions 7/n (#164913)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Almost there!

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:
 INFO 0 errors (6,884 ignored)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164913
Approved by: https://github.com/oulgen
2025-10-08 07:27:17 +00:00
12d2ef557f Update round size with 1 division behavior (#162203)
Have round size return the nearest power of 2 greater than or equal to size when using one division.
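
A small illustration of the rounding described above (nearest power of two >= size when one division is configured); this is not the allocator's C++ implementation:

```python
def round_size(size: int) -> int:
    # Round up to the nearest power of two that is >= size.
    return 1 if size <= 1 else 1 << (size - 1).bit_length()

assert round_size(1000) == 1024
assert round_size(1024) == 1024
assert round_size(1025) == 2048
```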

Fixes #161139

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162203
Approved by: https://github.com/ezyang
2025-10-08 06:41:46 +00:00
65aa62d50d Use codegen for the boxed interpreters (#164573)
Authored with Claude Code. The arg parsing is kind of horrible; open to more suggestions.

Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164573
Approved by: https://github.com/albanD, https://github.com/jansel
2025-10-08 06:27:44 +00:00
6a09f9306c Fix #164742, all header-impl'd userfacing functions should be inline (#164871)
It is as @mxmpl pointed out; we are missing an inline.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164871
Approved by: https://github.com/mikaylagawarecki
2025-10-08 05:57:19 +00:00
19bf67be32 multimem reduce (#164517)
Modified `multimem_one_shot_all_reduce_out` function to accept a `root` argument, making it a `multimem_reduce` op.

The original `multimem_one_shot_all_reduce` op becomes a caller of the `multimem_reduce`, with each rank providing its own rank id as root.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164517
Approved by: https://github.com/ngimel
2025-10-08 05:25:16 +00:00
1927783aa3 Revert "Reland vision pinned commit hash update (#164492)"
This reverts commit 6861a270624b44954826688f8dad668eb0154452.

Reverted https://github.com/pytorch/pytorch/pull/164492 on behalf of https://github.com/izaitsevfb due to see autorevert msg above, inductor breakage is legit ([comment](https://github.com/pytorch/pytorch/pull/164492#issuecomment-3379537888))
2025-10-08 04:38:26 +00:00
184817c7a8 locks + unit tests (#164636)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

D83714690

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164636
Approved by: https://github.com/aorenste
2025-10-08 04:34:22 +00:00
da903b6a8b list_stored_sd_metadata API. (#160610)
Summary:
1. Certain checkpoint load use cases are not aware of the properties of the data/tensors they want to load.
2. These use cases include data loader checkpoints and reading data for post-processing (when the original model definition is not available).
3. There, we have to use the saved checkpoint (metadata) as our source of truth.
4. This RFC proposal exposes the checkpoint metadata using a public API.

In this proposal we expose the stored state-dict metadata (minus associated storage/chunk metadata).

Chunk/storage details should not be exposed to users; they are an implementation detail of the storage writer/reader.

Test Plan:
UT.

Rollback Plan:

Differential Revision: D80231457

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160610
Approved by: https://github.com/saumishr
2025-10-08 04:33:51 +00:00
f76fdcaaf8 [Benchmark] cleanup huggingface models (#164815)
Prune models from TorchInductor dashboard to reduce ci cost. This PR prunes for hugging face models according to the [doc](https://docs.google.com/document/d/1nLPNNAU-_M9Clx9FMrJ1ycdPxe-xRA54olPnsFzdpoU/edit?tab=t.0), which reduces from 46 to 27 models.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164815
Approved by: https://github.com/anijain2305, https://github.com/seemethere, https://github.com/huydhn, https://github.com/malfet
2025-10-08 03:21:04 +00:00
608792153f [inductor][codecache] Print bytes in codecache debug output (#164898)
Summary: We have an internal request to help understand why the hash of `post_grad_custom_post_pass` is changing between attempts. We don't get useful info from the debug output, because we just print "<bytes>". Instead, attempt to print at least _some_ of the value in case it contains readable characters.

Test Plan:
Registered a dummy post_grad_custom_pass and printed codecache debug output
`TORCH_LOGS=+torch._inductor.codecache python ~/foo.py`

Yields something like:
```
V1007 16:41:19.024000 3546009 /data/users/slarsen/pytorch-3.10_4/torch/_inductor/codecache.py:989] [0/0] [law2ujt2wzjb5tyiu6jh64r2lxpvl62yvxcsmdouhg3qyelhhdv] post_grad_custom_post_pass: HelloWorld!������...
```

Differential Revision: [D84108770](https://our.internmc.facebook.com/intern/diff/D84108770)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164898
Approved by: https://github.com/oulgen
2025-10-08 02:45:20 +00:00
086dec3235 Pyrefly suppressions 6/n (#164877)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Almost there!

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:

INFO 0 errors (5,064 ignored)

Only four directories left to enable

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164877
Approved by: https://github.com/oulgen
2025-10-08 02:30:57 +00:00
ad7b2bebc6 Use tuples to have a deterministic ordering. (#164851)
When debugging, I noticed some non-deterministic behavior and tracked it down to this literal set. Changed it to a tuple for determinism. Also changed two other small literal sets, because using a set for a small lookup like that is slow.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164851
Approved by: https://github.com/bobrenjc93, https://github.com/bdhirsh
2025-10-08 02:12:03 +00:00
d444384003 [SymmMem] Tiled reduce (#162243)
Added op: `tile_reduce(Tensor input, Tensor(a!) out, int root, str group_name)`

For now supports only:
- NVSHMEM backed symmetric tensor;
- 2D tensor and tile;
- torch.float.

Testing on the right-bottom quadrant:
```
rank 0:
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1.]], device='cuda:0')
PASSED
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162243
Approved by: https://github.com/ngimel
2025-10-08 02:03:04 +00:00
3040a5d294 Revert "[dynamo] Support torch.fx.traceback.annotate (#164678)"
This reverts commit 801e282f39e9ef4424dfd3ecfd2b550a44595229.

Reverted https://github.com/pytorch/pytorch/pull/164678 on behalf of https://github.com/izaitsevfb due to breaks executorch internally, see [D84068062](https://www.internalfb.com/diff/D84068062?entry_point=16) ([comment](https://github.com/pytorch/pytorch/pull/164678#issuecomment-3379281844))
2025-10-08 01:49:34 +00:00
97463d4cf3 Revert "Fix double dispatch to Python for detach (#163671)"
This reverts commit c32118dc3e50505fd285e6e448a90883fce11535.

Reverted https://github.com/pytorch/pytorch/pull/163671 on behalf of https://github.com/izaitsevfb due to breaks export tests ([comment](https://github.com/pytorch/pytorch/pull/163671#issuecomment-3379281422))
2025-10-08 01:46:45 +00:00
c813617c53 [PP] Migrate other schedules to use PipelineScheduleRuntime (#164777)
Second fix for https://github.com/pytorch/pytorch/issues/164756

It has been a TODO to make all schedules execute using the same runtime. After this change, schedules share the `_PipelineScheduleRuntime` logic, which adds `UNSHARD` and `RESHARD` operations to the schedules and fixes the issue mentioned above.

<img width="920" height="406" alt="image" src="https://github.com/user-attachments/assets/a4d5bcd0-7dac-43cd-96f9-8ca33cfd8b91" />

A test is failing after the conversion:
- Fixed a gradient scaling issue for dWeight

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164777
Approved by: https://github.com/fegin
ghstack dependencies: #164775
2025-10-08 01:45:57 +00:00
e659661ffa [PP] Fix FSDP unshard/reshard (#164775)
First fix for https://github.com/pytorch/pytorch/issues/164756

In the pipeline IR we call `UNSHARD` and `RESHARD`, but there is a bug: `module.unshard()` does not recurse into nested FSDP modules, which sometimes leads to calling allgather before the module forward.

Since we want the pipeline IR to explicitly handle this, we can call `group.unshard` instead which ensures that all the modules are unsharded.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164775
Approved by: https://github.com/weifengpy
2025-10-08 01:45:57 +00:00
41808b2ba9 fix flex attention eager bwd: more rounding (#164317)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164317
Approved by: https://github.com/drisspg
ghstack dependencies: #163986
2025-10-08 01:17:45 +00:00
c0510dc447 [ContextParallel] add _LoadBalancer classes, and load-balance interface to Context Parallel APIs (#161062)
**Summary**
This PR provides an interface for users to specify how to load-balance the attention
input. The load-balance is essentially a rearrangement of the input tensor(s) over the
seq_dim before sharding and can be specified via an index tensor `rearrange` such
that Q[rearrange] is the balanced Q users want (i.e. `rearrange[i] == j` where `i` is the new
index of `Q[j]` in the balanced Q). An example is the `_generate_round_robin_indices()` added
in https://github.com/pytorch/pytorch/pull/155442.

**New `_LoadBalancer` classes**
New `_LoadBalancer` class (defined in `torch/distributed/tensor/experimental/_load_balancer.py`)
provides one interface for defining load-balance behavior: `_generate_indices(self, restore: bool = False)`.

When `restore == False`, this method should output an index Tensor (namely `rearrange_idx`) such
that QKV will be transformed into Q' K' V' in a way that `Q'[i] == Q[rearrange_idx[i]]` (same applies
to K and V).

When `restore == True`, this method outputs an index Tensor (namely `restore_idx`) such that
`Q'[restore_idx] == Q` (same applies to K and V).
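
A small numeric check of the rearrange/restore contract above (illustrative only, not the `_LoadBalancer` implementation); the permutation below is a head-tail order for 2 ranks with chunks of size 2:

```python
import torch

Q = torch.arange(8)
rearrange_idx = torch.tensor([0, 1, 6, 7, 2, 3, 4, 5])
Q_balanced = Q[rearrange_idx]                    # Q'[i] == Q[rearrange_idx[i]]
restore_idx = torch.argsort(rearrange_idx)
assert torch.equal(Q_balanced[restore_idx], Q)   # Q'[restore_idx] == Q
```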

**Impact**
2 public CP APIs and 1 private CP API are modified. This PR should be backward-compatible:
- For uses w/ SDPA, existing users must be using the `context_parallel()` API, which does not
take in the extra `load_balancer` argument and determines the behavior solely from the global var
`_cp_options.enable_load_balance`.
- For new users, including those who want to try `flex_attention()`, we require using the new API
`_context_parallel_buffers` to explicitly shard the QKV input instead of using `context_parallel()`,
because we no longer rely on TorchDispatchMode or TorchFunctionMode for op replacement. We
also require users to explicitly pass in a `load_balancer` argument if load-balancing is demanded.

**Load-Balance Behavior**
`context_parallel_unshard()`, and `create_cp_block_mask()` APIs now take an extra optional argument
`load_balancer`. This argument is optional because of backward compatibility but we require new users
to explicitly pass in a `load_balancer` if load-balancing is demanded:
- if `load_balancer == None` and `_cp_options.enable_load_balance == False`, CP performs
no load-balancing on input Tensors.
- if `load_balancer == None` and `_cp_options.enable_load_balance == True`, CP performs
head-tail load-balancing (e.g. split a Tensor into 2*N chunks; the first N are called head chunks and
the rest are called tail chunks. Place the first head chunk and the last tail chunk on rank 0, the second
head chunk along with the second-last tail chunk on rank 1, and so on).

`_context_parallel_buffers()` also takes the extra optional argument `load_balancer`, but the behavior
is slightly different from the other 2 APIs -- it doesn't branch on `_cp_options.enable_load_balance`:
- if `load_balancer == None`, no load-balancing will be performed
- otherwise, apply load-balancing using `load_balancer._generate_indices()` before sharding.

**Changes**
This PR moves the index Tensor generation logic into a set of LoadBalancer classes and
make LoadBalancer the common interface for Context Parallel APIs that leverages
load-balancing:
* _context_parallel_buffers
* context_parallel_unshard
* create_cp_block_mask

The `_LoadBalancer` classes added are:
- `_LoadBalancer`: the abstract base class that provides the `_generate_indices` interface for index Tensor generation.
- `_HeadTailLoadBalancer`: Implements head-tail balancing logic.
- `_PerDocumentHeadTailLoadBalancer`: Supports per-document head-tail balancing for batched sequences.

**Test**
`pytest test/distributed/tensor/test_attention.py`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161062
Approved by: https://github.com/fegin
2025-10-08 01:09:14 +00:00
9ec10dc26a utils + unit tests (#164551)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

Differential Revision: D83714691

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164551
Approved by: https://github.com/aorenste
2025-10-08 01:05:45 +00:00
43fc859625 Don't return values in void functions (#164809)
This PR fixes returning values in void C++ functions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164809
Approved by: https://github.com/janeyx99
2025-10-08 01:04:14 +00:00
f713abab16 Revert "Enable all flake8-logging-format rules (#164655)"
This reverts commit e98c4e835b1db22092fc93b49d2cddd7b3537d1f.

Reverted https://github.com/pytorch/pytorch/pull/164655 on behalf of https://github.com/malfet due to Looks like it broke lint in trunk, see bd3b98a8a5/1 ([comment](https://github.com/pytorch/pytorch/pull/164655#issuecomment-3379209309))
2025-10-08 00:55:17 +00:00
bd3b98a8a5 [dynamic shapes] make backed_size_oblivious behavior consistent b/w symbolic_shapes/inductor (#164796)
Summary: call guard_or_ directly to enable backed_size_oblivious in inductor calls to guard_or.

Test Plan: CI and unit test added.

Differential Revision: D84009392

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164796
Approved by: https://github.com/laithsakka
2025-10-08 00:19:06 +00:00
e98c4e835b Enable all flake8-logging-format rules (#164655)
These rules are enabled by removing existing suppressions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164655
Approved by: https://github.com/janeyx99
2025-10-08 00:16:13 +00:00
7b15534434 [export] Fix weight sharing when there is no complete tensor (#164857)
Summary: As titled.

Test Plan: CI

Differential Revision: D84079625

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164857
Approved by: https://github.com/yushangdi
2025-10-07 23:40:13 +00:00
c32118dc3e Fix double dispatch to Python for detach (#163671)
This fixes #71725.

Differential Revision: [D83857880](https://our.internmc.facebook.com/intern/diff/D83857880)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163671
Approved by: https://github.com/ezyang, https://github.com/albanD
2025-10-07 23:34:37 +00:00
e3ae80fc03 [PP] Let PP split BlockMask into micro-BlockMask (#164111)
BlockMask has batch dimension information, so PP has to split it as well, just like all other tensors. All the tensors in BlockMask have the batch dimension, so we can split them without too many issues. However, `mask_mod` takes the batch index as an input, and that value changes after the split, so we have to wrap it inside a closure that adjusts the batch index.
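
An illustrative sketch of that closure (not the PR code): a micro-batch starting at global batch offset `start` reuses the original mask_mod by shifting the batch index back.

```python
def shift_batch(mask_mod, start):
    # flex_attention mask_mod signature: (b, h, q_idx, kv_idx) -> bool
    def wrapped(b, h, q_idx, kv_idx):
        return mask_mod(b + start, h, q_idx, kv_idx)
    return wrapped
```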

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164111
Approved by: https://github.com/H-Huang
2025-10-07 23:25:34 +00:00
483f4e0db9 CUDA 13.0 builds fix on Amazon Linux 2023 (#164870)
During 2.9 RC testing I am seeing an issue on Amazon Linux 2023 with CUDA 13.0 builds.

This is related to:
 https://github.com/pytorch/pytorch/issues/152756

Workflow: https://github.com/pytorch/test-infra/actions/runs/18324074610/job/52184079262

Error:
```
WARNING: There was an error checking the latest version of pip.
+ python3.11 .ci/pytorch/smoke_test/smoke_test.py --package torchonly
Traceback (most recent call last):
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 333, in _load_global_deps
    ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
  File "/usr/lib64/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: libcudart.so.13: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/pytorch/pytorch/.ci/pytorch/smoke_test/smoke_test.py", line 12, in <module>
    import torch
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 425, in <module>
    _load_global_deps()
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 383, in _load_global_deps
    _preload_cuda_deps(lib_folder, lib_name)
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 317, in _preload_cuda_deps
    raise ValueError(f"{lib_name} not found in the system path {sys.path}")
Traceback (most recent call last):
ValueError: libnvToolsExt.so.*[0-9] not found in the system path ['/pytorch/pytorch/.ci/pytorch/smoke_test', '/usr/lib64/python311.zip', '/usr/lib64/python3.11', '/usr/lib64/python3.11/lib-dynload', '/usr/local/lib64/python3.11/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/lib64/python3.11/site-packages', '/usr/lib/python3.11/site-packages']
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in <module>
    main()
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
    run_cmd_or_die(f"docker exec -t {container_name} /exec")
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
    raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t 7d9c5bd403cac9a9ee824d63a1d6f6057ecce89a7daa94a81617dbf8eff0ff2e /exec failed with exit code 1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164870
Approved by: https://github.com/Camyll

Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2025-10-07 22:52:53 +00:00
d1a62c8036 [BE][Ez]: Enable RUF007 Prefer itertools.pairwise over zip slicing (#164856)
Now that our minimum version is 3.10, we can support this rule. This is more concise, readable, and efficient than the previous zip slicing.
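
For example:

```python
from itertools import pairwise

xs = [1, 2, 3, 4]
pairs_old = list(zip(xs[:-1], xs[1:]))  # zip slicing
pairs_new = list(pairwise(xs))          # Python >= 3.10
assert pairs_old == pairs_new == [(1, 2), (2, 3), (3, 4)]
```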

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164856
Approved by: https://github.com/williamwen42
2025-10-07 22:51:17 +00:00
6861a27062 Reland vision pinned commit hash update (#164492)
Redo https://github.com/pytorch/pytorch/pull/154694

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164492
Approved by: https://github.com/yangw-dev
2025-10-07 22:45:05 +00:00
955f21dc2c [ROCm][CI] Add support for gfx1100 in rocm workflow + test skips (#148355)
This PR adds infrastructure support for gfx1100 in the rocm workflow. Nodes have been allocated for this effort.
@dnikolaev-amd contributed all the test skips.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148355
Approved by: https://github.com/jeffdaily

Co-authored-by: Dmitry Nikolaev <dmitry.nikolaev@amd.com>
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-07 22:36:25 +00:00
9f5e1beaf3 [multi-kernel] base tensor sizes for shape cache key (#164499)
to match shape key in 3ca09d65f1/torch/_inductor/select_algorithm.py (L3571)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164499
Approved by: https://github.com/ColinPeppler
2025-10-07 21:27:40 +00:00
2e027e8742 [inductor] Improve bound on the number of dims to match for the block (#163755)
- Removes redundant broadcast code when `len(kernel.range_tree_nodes)` is much larger than `len(range_tree.nodes)`. For example:
```python
# before, the broadcast is to [1, 1, XBLOCK, R0_BLOCK]
tmp0 = tl.reshape(tl.broadcast_to(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last')[:, None, :, :], [(511 + XBLOCK) // 512, ((1) * ((1) <= ((511 + XBLOCK) // 512)) + ((511 + XBLOCK) // 512) * (((511 + XBLOCK) // 512) < (1))), ((512) * ((512) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (512))), R0_BLOCK]), [XBLOCK, R0_BLOCK])
# after
tmp0 = tl.reshape(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last'), [XBLOCK, R0_BLOCK])
```
- Fix: also save range_tree_nodes per subgraph

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163755
Approved by: https://github.com/eellison, https://github.com/blaine-rister
2025-10-07 21:02:37 +00:00
1e42fde45e Revert "[CUDA] Add experimental green context support for SM carveout (#159104)"
This reverts commit 746fe78ecd52f3e9cfddda41f0ac82dada7bdd0b.

Reverted https://github.com/pytorch/pytorch/pull/159104 on behalf of https://github.com/malfet due to Breaks Windows CD build ([comment](https://github.com/pytorch/pytorch/pull/159104#issuecomment-3378675515))
2025-10-07 20:51:22 +00:00
f505caa71b Revert "multimem reduce (#164517)"
This reverts commit d1cbb74fb16406488a174832e1b58b7c242f418d.

Reverted https://github.com/pytorch/pytorch/pull/164517 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164517#issuecomment-3378529654))
2025-10-07 20:12:38 +00:00
65f10becdf Support OVERLAP_F_B in schedule (#161072)
Previously, we converted the overlap_f_b action into separate forward and backward operations in the plan. This is a small change that includes it in the plan and handles it in the runtime.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161072
Approved by: https://github.com/fegin, https://github.com/wconstab
2025-10-07 19:55:10 +00:00
df640df68a Revert "Reapply "C++-accessible Placements via pybind11 (#163030)" (#164519)"
This reverts commit 8c0bc879b97bc580aaa0777b2d266bdd068cb528.

Reverted https://github.com/pytorch/pytorch/pull/164519 on behalf of https://github.com/malfet due to Still breaks internal workflows ([comment](https://github.com/pytorch/pytorch/pull/164519#issuecomment-3378469432))
2025-10-07 19:46:17 +00:00
4c3c0ef2f1 [precompile] Load source cache for AOT compile as well. (#164773)
Adds source_get_cache to the AOT compile case as well. Since the guard manager loader code can be shared between AOT and caching, we added a new function, load_guard_manager, to avoid code duplication between the two workflows for loading guards.

Test Plan: test_guard_serialization.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164773
Approved by: https://github.com/yiming0416, https://github.com/dolpm
2025-10-07 18:47:09 +00:00
bc33b10202 fix copy_ for scalar in inductor (#164167)
Fixes #158437

### Summary

- TorchInductor was not properly handling scalar copy operations (`tensor.copy_(scalar_value)`)
- Ensured scalar sources are converted to appropriate tensor representations with correct dtype and device
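
The pattern this targets looks roughly like the following (illustrative, not from the PR):

```python
import torch

@torch.compile
def set_constant(t):
    t.copy_(3.14)  # in-place copy of a Python scalar
    return t

out = set_constant(torch.zeros(4))
```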

### Impact

- Enables compilation of models using `tensor.copy_(scalar)` patterns
- module: inductor

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164167
Approved by: https://github.com/shunting314
2025-10-07 18:31:37 +00:00
2855a045b3 Use sym_eq and sym_and on symbolic shapes in common_meta_baddbmm_bmm (#164781)
Differential Revision: D84005053

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164781
Approved by: https://github.com/Skylion007
2025-10-07 18:25:00 +00:00
9ecd092bd9 Add python bindings for NCCL CTA policies (#164309)
NCCLConfig can now be constructed with non-default [cta policies][1]

```python
import torch
from torch.distributed import ProcessGroupNCCL as nccl

config = nccl.NCCLConfig()
config.cta_policy = nccl.NCCL_CTA_POLICY_ZERO  # NCCL version >= 2.28
```

[1]: https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2283/user-guide/docs/api/flags.html#nccl-communicator-cta-policy-flags

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164309
Approved by: https://github.com/eqy
2025-10-07 18:16:20 +00:00
078d475d3b move partition and compiler fns from stage 1 to stage 2 (#164765)
Differential Revision: D83995689

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164765
Approved by: https://github.com/zhxchen17
2025-10-07 18:02:03 +00:00
f37a6523ef Move version.h to torch/headeronly (#164381)
Differential Revision: [D83685392](https://our.internmc.facebook.com/intern/diff/D83685392)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164381
Approved by: https://github.com/janeyx99
2025-10-07 17:47:30 +00:00
b13cd141b3 Add pyrefly suppressions (#164748)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the `project-excludes` field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:

0 errors (4,263 ignored)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164748
Approved by: https://github.com/oulgen
2025-10-07 17:31:18 +00:00
5e47b4dd60 Remove device_id param from DeviceCachingAllocator::malloc (#164798)
The `malloc` call in DeviceCachingAllocator accepts a DeviceIndex param, which can be confusing because the allocator can only allocate memory for the device it corresponds to. This associated device is fixed at construction time, so the runtime param can be misleading.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164798
Approved by: https://github.com/ngimel, https://github.com/cyyever, https://github.com/eqy
2025-10-07 16:42:04 +00:00
ee5389d520 Enable batch samples in sparse tests (#164677)
The test cases are enabled because the issue was fixed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164677
Approved by: https://github.com/albanD
2025-10-07 15:58:37 +00:00
ab01a0d7d3 Add memory estimator (#164738)
Original work by @ShatianWang, with lints applied. I am going to make a few changes and add tests in subsequent PRs, but I want to preserve the original commit first.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164738
Approved by: https://github.com/IvanKobzarev
ghstack dependencies: #164568, #164569, #164581
2025-10-07 15:32:27 +00:00
801e282f39 [dynamo] Support torch.fx.traceback.annotate (#164678)
Builds on top of https://github.com/pytorch/pytorch/pull/163673 and https://github.com/pytorch/pytorch/pull/164174. This will be used in the followup PRs to apply regional inductor compilation.

The existing implementation let Dynamo trace into `torch.fx.traceback.annotate`, but that's not what we want. We want Dynamo to essentially run the torch.fx.traceback.annotate function in eager mode, so that every FX node created in the Dynamo FX graph carries the custom node meta.

This does not work with graph breaks yet. But we can solve that problem, if needed, in a separate PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164678
Approved by: https://github.com/SherlockNoMad, https://github.com/jansel, https://github.com/xmfan
2025-10-07 14:54:26 +00:00
87c9fbda22 Follow up to PR 163980 for s390x (#164464)
Now, with the same updates propagated to s390x, it works on s390x runners too.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164464
Approved by: https://github.com/atalman
2025-10-07 12:02:29 +00:00
3cc8af2d67 torch.topk: refactor global histogram/cumsum into a dedicated kernel to eliminate redundant memory access (#164459)
# TLDR
This PR removes the regression in torch.topk introduced in torch 2.7.0 and delivers much better performance for large inputs.

The table below reports execution times on H20 for various input sizes with float32 data, extracting the top-100 values. Results indicate that this PR restores and improves performance, especially on large inputs.
| Input Shape    | torch2.6.0 (ms) | torch2.8.0 (ms) | 2.8.0+this PR (ms) |
| -------------- | --------------- | --------------- | ------------------ |
| (1, 1B)        | 36.6            | 1564.1          | 25.6               |
| (1, 100M)      | 3.56            | 17.4            | 2.54               |
| (1, 1000,000)  | 0.135           | 0.145           | 0.098              |
| (512, 128000)  | 1.33            | 1.33            | 1.32               |
| (8192, 128000) | 19.6            | 19.6            | 19.4               |

# Background
After upgrading PyTorch from 2.6.0 to 2.7.0, we observed a significant GPU performance regression in `torch.topk` on NVIDIA GPUs. For instance, extracting the top-1000 largest values from one billion floats on an NVIDIA H20 increased from **36 ms** to **1.6 s**.

Profiling with Nsight Compute indicates that the slowdown is caused by redundant memory accesses introduced in [PR #145536](https://github.com/pytorch/pytorch/pull/145536).

# Analysis

`torch.topk` relies on **RadixSelect** to find the target values. Each radix pass requires computing a histogram of the input values. For large inputs, histogram computation is split into two stages:

1. **Local histogram**: Each CUDA block processes a subset of the input and writes its local histogram to global memory.
2. **Global reduction**: A single CUDA block reads all local histograms from global memory and reduces them into the final global histogram.

Before [PR #145536](https://github.com/pytorch/pytorch/pull/145536), both stages ran inside a single kernel (`radixFindKthValues`), using a semaphore to ensure that all local histograms were completed before reduction.

In PR #145536, the global histogram computation was merged with subsequent top-k calculations into a single kernel (`computeBlockwiseKthCounts`) to avoid the semaphore. While this simplifies synchronization, it introduces **redundant memory reads**:

- `computeBlockwiseKthCounts` launches `numInputSlices * blocks_per_slice` blocks.
- For each row (slice), `blocks_per_slice` CUDA blocks redundantly reload the same local histograms from global memory.

# This PR

To address this inefficiency, we introduce the following optimizations:

1. **Dedicated kernel**: Refactor global histogram and cumsum computation into a separate GPU kernel, `computeDigitCumSum`.
2. **Loop unrolling**: Apply loop unrolling in `computeDigitCumSum` to speed up local histogram reads.
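
For intuition only, here is a small NumPy sketch of the two-stage scheme that the dedicated kernel performs on the GPU: each block writes a local histogram of the current radix digit once, and a single reduction pass then sums the local histograms and takes an inclusive cumulative sum over the 256 digit counts. The block count and input size below are arbitrary; this is not the CUDA implementation.

```python
import numpy as np

RADIX_BITS = 8
RADIX_DIGITS = 1 << RADIX_BITS

def local_histograms(values, blocks_per_slice):
    # Stage 1: each "block" histograms its chunk of the slice independently.
    chunks = np.array_split(values, blocks_per_slice)
    counts = np.zeros((blocks_per_slice, RADIX_DIGITS), dtype=np.uint32)
    for b, chunk in enumerate(chunks):
        digits = (chunk >> 24) & (RADIX_DIGITS - 1)  # top 8 bits as the current digit
        np.add.at(counts[b], digits, 1)
    return counts

def digit_cumsum(counts):
    # Stage 2 (the dedicated kernel): reduce the local histograms once and
    # take an inclusive prefix sum over the digit counts.
    return np.cumsum(counts.sum(axis=0, dtype=np.uint64))

values = np.random.randint(0, 2**32, size=1 << 16, dtype=np.uint32)
cumsum = digit_cumsum(local_histograms(values, blocks_per_slice=8))
assert cumsum[-1] == values.size  # every value falls into exactly one digit bucket
```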

# Performance
We benchmarked torch.topk on NVIDIA H20 with float32 inputs, extracting the top-100 values across different input sizes. The results in the table below demonstrate that this PR effectively eliminates the performance regression introduced in 2.7.0 and delivers substantial improvements on large inputs.

| Input Shape    | torch2.6.0 (ms) | torch2.8.0 (ms) | 2.8.0+this PR (ms) |
| -------------- | --------------- | --------------- | ------------------ |
| (1, 1B)        | 36.6            | 1564.1          | 25.6               |
| (1, 100M)      | 3.56            | 17.4            | 2.54               |
| (1, 1,000,000) | 0.135           | 0.145           | 0.098              |
| (512, 128000)  | 1.33            | 1.33            | 1.32               |
| (8192, 128000) | 19.6            | 19.6            | 19.4               |

In addition, I have verified the correctness of this PR with different inputs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164459
Approved by: https://github.com/ngimel, https://github.com/Skylion007
2025-10-07 11:04:03 +00:00
1fb072ac2a exceptions + unit tests (#164550)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

Differential Revision: D83714688

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164550
Approved by: https://github.com/aorenste
2025-10-07 10:04:58 +00:00
cac5e13e13 [dynamo] Inline nn module calls using __call__ methods (#164817)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164817
Approved by: https://github.com/SherlockNoMad, https://github.com/mlazos
2025-10-07 08:57:20 +00:00
68350660ee Increase timeout for nightly macOS performance tests to 300 minutes (#164793)
The Test step time recently went up slightly.

Hopefully this fixes https://github.com/pytorch/alerting-infra/issues/263
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164793
Approved by: https://github.com/seemethere
2025-10-07 08:44:07 +00:00
ef7e2ca77e remove check_is_size from test_misc.py (#164667)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164667
Approved by: https://github.com/angelayi
ghstack dependencies: #164664, #164665
2025-10-07 07:33:50 +00:00
cdaaf3e4a3 remove size-like based size-oblivious special max simplifications (#164665)
As we removed guard_size_oblivious, this simplification is no longer relevant. This is part of the process of deprecating guard_size_oblivious and its dependencies.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164665
Approved by: https://github.com/aorenste
ghstack dependencies: #164664
2025-10-07 07:33:50 +00:00
0ea59c3c55 do not suggest torch._check_is_size() (#164664)
The size-like concept for data dependency is no longer relevant, as we removed all guard_size_oblivious calls.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164664
Approved by: https://github.com/angelayi, https://github.com/mlazos
2025-10-07 07:33:50 +00:00
8f705d019a context + unit tests (#164549)
Summary:
The context module provides configurable context selection plus isolation key hashing.

Context selection is broken into runtime context and compile context. Runtime context is decided at call time (inductor configs, precision configs, etc.), and compile context is decided at compile time (hardware type, software hashes).

Callees are given access to SelectedRuntimeContext and SelectedCompileContext, which they can use to determine and select what context is relevant to the function being cached.

These selected contexts are wrapped in an IsolationSchema, which denotes what context should be taken into account when producing an isolation key. The isolation key is essentially a salt of the function signature key: it says that a given function signature key result is valid under a given context (isolation schema).
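
A rough, self-contained sketch of the isolation-key idea, with hypothetical class and field names (the real module lives in the inductor caching code and is not reproduced here):

```python
import hashlib
import json
from dataclasses import dataclass, asdict

@dataclass(frozen=True)
class SelectedRuntimeContext:
    inductor_configs: tuple      # e.g. sorted (name, value) pairs
    precision: str

@dataclass(frozen=True)
class SelectedCompileContext:
    hardware: str
    software_hash: str

def isolation_key(function_signature_key: str,
                  runtime: SelectedRuntimeContext,
                  compile_ctx: SelectedCompileContext) -> str:
    # The isolation schema salts the function signature key with the selected
    # context, so a cached result is only considered valid under that context.
    payload = json.dumps(
        {"sig": function_signature_key,
         "runtime": asdict(runtime),
         "compile": asdict(compile_ctx)},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode()).hexdigest()

key = isolation_key(
    "fn_sig_v1",
    SelectedRuntimeContext(inductor_configs=(("max_autotune", True),), precision="fp16"),
    SelectedCompileContext(hardware="H100", software_hash="abc123"),
)
```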

Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

 D83714689

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164549
Approved by: https://github.com/aorenste
2025-10-07 06:02:10 +00:00
4bcc05777e [torchfuzz] synthesize inputs for data dependent ops (#164716)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164716
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693, #164694, #164715
2025-10-07 05:40:32 +00:00
2a6cdba6e5 [torchfuzz] various edge case fixes (#164715)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164715
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693, #164694
2025-10-07 05:30:46 +00:00
53f6cc7529 [torchfuzz] make ops_fuzzer deterministic (#164694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164694
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693
2025-10-07 05:30:46 +00:00
ac901bf79a [torchfuzz] consolidate on a base implementation of args_codegen (#164693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164693
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688
2025-10-07 05:20:28 +00:00
c965d6dbb2 [torchfuzz] move into experimental dir (#164688)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164688
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687
2025-10-07 05:09:08 +00:00
ac08556f67 [torchfuzz] support more unbacked functions (#164687)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164687
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649
2025-10-07 05:00:03 +00:00
5fe7f29b9e [torchfuzz] add support for operator weights (#164649)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164649
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647
2025-10-07 05:00:03 +00:00
ded099ecbf [torchfuzz] don't use the first gpu in multi process fuzzer (#164647)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164647
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646
2025-10-07 04:59:56 +00:00
63fcc3e6c4 [torchfuzz] update README.md (#164646)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164646
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514
2025-10-07 04:59:50 +00:00
fd3e15c14f Fix typo in class definition of bytecodedispatchtable (#164762)
ghstack-source-id: 84f0d7bb7e3780ca75473782abfae530010be56e
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164761

Fixes the typo in the naming of bytecodedispatchtable

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164762
Approved by: https://github.com/StrongerXi, https://github.com/williamwen42
2025-10-07 04:36:09 +00:00
ff5faa744a Remove unused THPXXX macros (#164660)
These macros are not used in OSS.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164660
Approved by: https://github.com/albanD
2025-10-07 04:04:21 +00:00
4725871a81 Return fake mode from export graph capture API (#164730)
This PR temporarily unblocks various experiments that re-use the fake mode Dynamo creates. Note that this is still not what we want as the end state. The end state should look something like:
```
out = fullgraph_capture(mod, inputs)
fake_mode = out.backend_inputs.fake_mode
gm = out.module()
```
This doesn't work today because export requires wrapping the original module to set up a flat module to trace, for easier pytree handling. As a result, we would need to carry an export-specific flag in fullgraph_capture, which seems not ideal.
Regardless, the end state is that we need to give downstream users a graph module and a fake mode in some form, so I think having _dynamo_graph_capture_for_export return the fake mode within the graph module itself via gm.meta is a reasonable interim step.
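
As a rough, self-contained illustration of the storage/consumption pattern described above (the `meta` key name and the manual stashing of the mode are assumptions for illustration; in the PR the graph module comes from the capture API and already carries Dynamo's fake mode):

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Stand-in graph module; the real one would be produced by the export capture API.
gm = torch.fx.symbolic_trace(torch.nn.Linear(2, 2))
gm.meta["fake_mode"] = FakeTensorMode()   # key name assumed

# A downstream consumer fetches the mode and fakifies real inputs with it.
fake_mode = gm.meta["fake_mode"]
fake_inp = fake_mode.from_tensor(torch.randn(3, 2))
```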

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164730
Approved by: https://github.com/avikchaudhuri
2025-10-07 03:42:46 +00:00
bcd96cc6ff [annotate] Copy fwd to bwd metadata for subgraphs as well (#164795)
The test is in the next PR. My older PR on dynamo annotate, https://github.com/pytorch/pytorch/pull/164678, is being reverted for reasons that are not yet clear, so it is difficult to add a test in this PR right now. When I reland it, I can add a test for this as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164795
Approved by: https://github.com/yushangdi
ghstack dependencies: #164772
2025-10-07 02:42:47 +00:00
50e077beaa Fix outdated info in requirements-ci.txt (#164441)
Fixes installation instructions and descriptions for `numba` and `scikit-image`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164441
Approved by: https://github.com/albanD
2025-10-07 02:10:41 +00:00
56d66ac0d7 Make custom op alias check consistent (#164576)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164576
Approved by: https://github.com/soulitzer
2025-10-07 02:05:09 +00:00
49f7d8d19d [ROCm] Fix test_cuda_synchronize failure on ROCm (#164735)
This PR skips the hipify step of torch/csrc/jit/ir/ir.h to avoid a build-time error for the JIT cuda namespace.  This fixes two skipped tests in test/jit/test_cuda.py.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164735
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-07 01:14:24 +00:00
afee8062d5 Revert "Fix mesh.get_local_rank when it is > 1d (#164473)"
This reverts commit 83d71dfb2fd993a6242372b8123549acaa85ffdb.

Reverted https://github.com/pytorch/pytorch/pull/164473 on behalf of https://github.com/izaitsevfb due to appears to be causing vision_maskrcnn regression ([comment](https://github.com/pytorch/pytorch/pull/164473#issuecomment-3374738997))
2025-10-07 00:37:41 +00:00
e89d12bf5d Numpy zerotensor handling (#164487)
Fixes #89034

Updated tensor_to_numpy() function in tensor_numpy.cpp to handle ZeroTensors by throwing an error if force=False and returning an array full of zeros if force=True.
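
A short usage sketch of the behavior described above (assuming `torch._efficientzerotensor` to construct a ZeroTensor; the exact exception type is not stated in this message):

```python
import torch

z = torch._efficientzerotensor((2, 3))   # ZeroTensor: all zeros, no real storage

print(z.numpy(force=True))               # with force=True: a (2, 3) array of zeros

try:
    z.numpy()                            # with force=False: expected to raise
except (RuntimeError, TypeError) as e:   # exception type assumed
    print("refused to convert ZeroTensor:", e)
```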

@ngimel, I just saw that you mentioned PyTorch is not too concerned with this issue but I had already worked on it so I figured I would push it anyways and see what you thought. Feel free to close the PR if you think it is not worth merging.

@albanD

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164487
Approved by: https://github.com/izaitsevfb
2025-10-07 00:34:14 +00:00
d4752bc7f6 [caffe2] tweak Unpickler::readInstruction handling TUPLE (#164764)
Summary: Creating the vector was a bit awkward. Use the natural iterator-pair constructor with move-iterators.

Test Plan: CI.

Reviewed By: dolpm

Differential Revision: D83995108

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164764
Approved by: https://github.com/drisspg
2025-10-07 00:18:10 +00:00
44a5d41993 [ROCm] add gfx1150 gfx1151 to supported gemm lists (#164744)
This is one of a few PRs needed to address https://github.com/pytorch/pytorch/pull/164744 fully.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164744
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-07 00:02:23 +00:00
361c5d362c [fx][traceback] Actually disable preservation of node metadata when enable=False (#164772)
This will come in handy when we run graph passes that add new nodes, and
create_proxy can add seq_nr meta.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164772
Approved by: https://github.com/SherlockNoMad
2025-10-06 23:39:12 +00:00
1fc71d1b57 Revert "Numpy zerotensor handling (#164487)"
This reverts commit f7ad6dbad67161333a1473d1e0b478b7475a0ec1.

Reverted https://github.com/pytorch/pytorch/pull/164487 on behalf of https://github.com/malfet due to Did it break torchbench?, see 8c728e129d/1 ([comment](https://github.com/pytorch/pytorch/pull/164487#issuecomment-3374635051))
2025-10-06 23:32:12 +00:00
8f54e27e5d [ROCm][CI] rebuild magma binary for gfx1150 gfx1151 (#164782)
After #164763 added gfx1150 and gfx1151 to the list of targets, this PR triggers a rebuild of the magma binary for ROCm 7 with the new targets.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164782
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-06 23:29:21 +00:00
8c0bc879b9 Reapply "C++-accessible Placements via pybind11 (#163030)" (#164519)
This makes Placement data representation available in C++ via pybind11. Reapply with fix for internal errors.

D83788896

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164519
Approved by: https://github.com/Skylion007, https://github.com/ezyang
2025-10-06 23:19:14 +00:00
746fe78ecd [CUDA] Add experimental green context support for SM carveout (#159104)
Low-level PyTorch APIs should be usable/stable enough at this point but we might move the underlying driver API usage a bit from here...

Built on top of @drisspg 's branch

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159104
Approved by: https://github.com/ngimel

Co-authored-by: drisspg <drisspguessous@gmail.com>
2025-10-06 23:11:23 +00:00
b63bbe1661 Remove old ROCm version check in tests (#164245)
This PR removes ROCm<6 version checks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164245
Approved by: https://github.com/jeffdaily
2025-10-06 22:42:01 +00:00
3912ba3e94 Revert "Fix refine_ranges corner case (#164075)"
This reverts commit 27234792add2ee9bedd84ca02dbf34f8f244bc5c.

Reverted https://github.com/pytorch/pytorch/pull/164075 on behalf of https://github.com/izaitsevfb due to fails executorch builds, see [D83938444](https://www.internalfb.com/diff/D83938444) ([comment](https://github.com/pytorch/pytorch/pull/164075#issuecomment-3374430964))
2025-10-06 22:09:39 +00:00
cfc5cc17dc Revert "[dynamo] Support torch.fx.traceback.annotate (#164678)"
This reverts commit 2883b5ab773daf5861d43ff0b65be49a441ab3f9.

Reverted https://github.com/pytorch/pytorch/pull/164678 on behalf of https://github.com/izaitsevfb due to fails inductor:max_autotune tests internally, see D83948169 ([comment](https://github.com/pytorch/pytorch/pull/164678#issuecomment-3374407009))
2025-10-06 22:03:42 +00:00
fdc8ccc5bc Make Adam, AdamW work with nonzero-dim Tensor betas (#149939)
Fixes #147921

## Changes

- Convert tensor `betas` using `_to_scalar`
- Change annotation of `betas` param
- Change param type in docs
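
A small usage sketch of what this enables (illustrative shapes and values), as exercised by the test below:

```python
import torch

model = torch.nn.Linear(4, 4)
# betas passed as 1-element tensors (nonzero-dim) rather than Python floats
opt = torch.optim.Adam(
    model.parameters(),
    betas=(torch.tensor([0.9]), torch.tensor([0.999])),
)

loss = model(torch.randn(2, 4)).sum()
loss.backward()
opt.step()
```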

## Test Result

```bash
pytest -s test/test_optim.py -k test_tensor_lr -vv
```

![image](https://github.com/user-attachments/assets/312ee045-1e8b-4789-aa6e-ba63e6df7e81)

![image](https://github.com/user-attachments/assets/7e6ec274-645b-46b9-b1a6-2b340a685203)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149939
Approved by: https://github.com/janeyx99

Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>
2025-10-06 22:03:25 +00:00
48b54b45d6 Replace pynvml with nvidia-ml-py in win-test.sh (#164681)
The pynvml package was deprecated in favor of nvidia-ml-py.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164681
Approved by: https://github.com/Aidyn-A, https://github.com/eqy
2025-10-06 21:57:26 +00:00
677 changed files with 11505 additions and 11217 deletions

View File

@ -344,7 +344,7 @@ docker build \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \

View File

@ -1 +1 @@
27664085f804afc83df26f740bb46c365854f2c4
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd

View File

@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950 conditionally starting in ROCm 7.0
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;

View File

@ -115,6 +115,9 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.

View File

@ -84,9 +84,9 @@ case ${image} in
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950 conditionally starting in ROCm 7.0
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;

View File

@ -120,9 +120,8 @@ ninja==1.11.1.4
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#Pinned versions: 0.55.2, 0.60.0
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
#numpy
@ -242,10 +241,9 @@ pygments==2.15.0
#Pinned versions: 14.1.0
#test that import:
scikit-image==0.19.3 ; python_version < "3.10"
scikit-image==0.22.0 ; python_version >= "3.10"
scikit-image==0.22.0
#Description: image processing routines
#Pinned versions:
#Pinned versions: 0.22.0
#test that import: test_nn.py
#scikit-learn

View File

@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -18,7 +18,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
.PHONY: all
all: magma-rocm70
all: magma-rocm64
all: magma-rocm63
.PHONY:
clean:
@ -34,8 +33,3 @@ magma-rocm70:
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:
$(DOCKER_RUN)
.PHONY: magma-rocm63
magma-rocm63: DESIRED_ROCM := 6.3
magma-rocm63:
$(DOCKER_RUN)

View File

@ -67,7 +67,7 @@ fi
# wheels with cxx11-abi
echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
if [[ "$(uname)" != 'Darwin' ]]; then
# We also check that there are cxx11 symbols in libtorch
#
echo "Checking that symbols in libtorch.so have the right gcc abi"

View File

@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
done
# Perform some "warm-start" runs for a few huggingface models.
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \

View File

@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi
# TODO: Move this to .ci/docker/requirements-ci.txt
python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
run_tests() {
# Run nvidia-smi if available

View File

@ -28,6 +28,10 @@ runs:
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: Print GPU info (if present)
shell: bash
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
- name: Check if in a container runner
shell: bash
id: check_container_runner
@ -82,37 +86,6 @@ runs:
# Prune all of the docker images
docker system prune -af
- name: Manually resolve download.pytorch.org
shell: bash
continue-on-error: true
run: |
set +e
set -x
PT_DOMAIN=download.pytorch.org
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
# cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
# one is returned at random
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
if [ -z "${RESOLVED_IP}" ]; then
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
if [ -z "${RESOLVED_IP}" ]; then
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
exit 1
fi
fi
if grep -r "${PT_DOMAIN}" /etc/hosts; then
# Clean up any old records first
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
fi
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
cat /etc/hosts
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true

Binary file not shown.

View File

@ -18,6 +18,7 @@ class GitHubComment:
body_text: str
created_at: str
author_login: str
author_url: Optional[str]
author_association: str
editor_login: Optional[str]
database_id: int

Binary file not shown.

View File

@ -38,6 +38,7 @@ def mock_get_comments() -> list[GitHubComment]:
body_text="mock_body_text",
created_at="",
author_login="",
author_url=None,
author_association="",
editor_login=None,
database_id=1,
@ -48,6 +49,7 @@ def mock_get_comments() -> list[GitHubComment]:
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
created_at="",
author_login=BOT_AUTHORS[1],
author_url=None,
author_association="",
editor_login=None,
database_id=2,

View File

@ -32,6 +32,7 @@ from trymerge import (
main as trymerge_main,
MandatoryChecksMissingError,
MergeRule,
PostCommentError,
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
@ -588,6 +589,23 @@ class TestTryMerge(TestCase):
self.assertEqual(mock_merge_base, pr.get_merge_base())
mocked_gh_fetch_merge_base.assert_called_once()
def test_app_can_revert(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 164660)
repo = DummyGitRepo()
app_comment_id, impostor_comment_id = 3375785595, 3377647892
# Check that app can revert
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
# But impostor can not
self.assertRaises(
PostCommentError,
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
)
# Despite its name being the name of the bot
self.assertEqual(
pr.get_comment_by_id(impostor_comment_id).author_login,
"pytorch-auto-revert",
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")

View File

@ -234,6 +234,7 @@ query ($owner: String!, $name: String!, $number: Int!) {
createdAt
author {
login
url
}
authorAssociation
editor {
@ -1093,6 +1094,7 @@ class GitHubPR:
body_text=node["bodyText"],
created_at=node["createdAt"] if "createdAt" in node else "",
author_login=node["author"]["login"],
author_url=node["author"].get("url", None),
author_association=node["authorAssociation"],
editor_login=editor["login"] if editor else None,
database_id=node["databaseId"],
@ -2029,6 +2031,11 @@ def validate_revert(
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
if pr.is_base_repo_private():
allowed_reverters.append("CONTRIBUTOR")
# Special case the pytorch-auto-revert app, which does not have an association
# But should be able to issue revert command
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
allowed_reverters.append("NONE")
if author_association not in allowed_reverters:
raise PostCommentError(
f"Will not revert as @{author_login} is not one of "

View File

@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
on:
schedule:
- cron: 15 0,12 * * 1-6
- cron: 15 0 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs

View File

@ -63,6 +63,7 @@ jobs:
# Same as the build job
python-version: 3.12.7
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
timeout-minutes: 300
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4

View File

@ -128,6 +128,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.2xlarge.memory
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
test-matrix: |

View File

@ -59,3 +59,29 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit

2
.gitignore vendored
View File

@ -88,7 +88,7 @@ torch_compile_debug/
# Listed manually because some files in this directory are not generated
torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/csrc/api/include/torch/version.h
torch/headeronly/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/generated
torch/csrc/generic/TensorMethods.cpp

View File

@ -28,7 +28,7 @@ exclude_patterns = [
'torch/lib/**',
'venv/**',
'**/*.pyi',
"tools/experimental/dynamic_shapes/torchfuzz/**",
"tools/experimental/torchfuzz/**",
'tools/test/test_selective_build.py',
]
command = [
@ -198,7 +198,7 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/dynamic_shapes/torchfuzz/**',
'tools/experimental/torchfuzz/**',
]
command = [
'python3',

View File

@ -13,6 +13,9 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load("//:tools/bazel.bzl", "rules")
# Export files for use by torch/headeronly (where version.h generation now lives)
exports_files(["version.txt"])
define_targets(rules = rules)
COMMON_COPTS = [
@ -690,7 +693,9 @@ cc_library(
"torch/csrc/*/generated/*.h",
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
] + torch_cuda_headers,
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
) + GENERATED_AUTOGRAD_CPP + [
"//torch/headeronly:version_h",
],
includes = [
"third_party/kineto/libkineto/include",
"torch/csrc",

View File

@ -483,8 +483,8 @@ at::BlasBackend Context::blasPreferredBackend() {
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
#endif
#if ROCM_VERSION >= 60500
"gfx950"
#if ROCM_VERSION >= 70000
"gfx950", "gfx1150", "gfx1151"
#endif
};
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
@ -587,20 +587,33 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
rocm_fa_preferred_backend = b;
}
bool Context::allowFP16ReductionCuBLAS() const {
CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
return allow_fp16_reduction_cublas;
}
void Context::setAllowFP16ReductionCuBLAS(bool b) {
allow_fp16_reduction_cublas = b;
CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
TORCH_CHECK(
!(allow_reduced_precision && !allow_splitk),
"allow_splitk=False is not supported when reduced precision reductions are enabled");
if (allow_reduced_precision) {
return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
} else if (allow_splitk) {
return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
} else {
return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
}
}
bool Context::allowBF16ReductionCuBLAS() const {
void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
}
CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
return allow_bf16_reduction_cublas;
}
void Context::setAllowBF16ReductionCuBLAS(bool b) {
allow_bf16_reduction_cublas = b;
void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
}
bool Context::allowFP16AccumulationCuBLAS() const {

View File

@ -38,6 +38,12 @@ namespace at {
class Tensor;
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
enum class CuBLASReductionOption : uint8_t {
AllowReducedPrecisionWithSplitK = 0,
DisallowReducedPrecisionAllowSplitK = 1,
DisallowReducedPrecisionDisallowSplitK = 2,
};
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
@ -357,10 +363,14 @@ class TORCH_API Context {
void setAllowTF32CuBLAS(bool);
Float32MatmulPrecision float32MatmulPrecision() const;
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
bool allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(bool);
bool allowBF16ReductionCuBLAS() const;
void setAllowBF16ReductionCuBLAS(bool);
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(
bool allow_reduced_precision,
bool allow_splitk = true);
CuBLASReductionOption allowBF16ReductionCuBLAS() const;
void setAllowBF16ReductionCuBLAS(
bool allow_reduced_precision,
bool allow_splitk = true);
bool allowFP16AccumulationCuBLAS() const;
void setAllowFP16AccumulationCuBLAS(bool);
@ -452,8 +462,10 @@ class TORCH_API Context {
: at::Float32MatmulPrecision::HIGHEST;
int benchmark_limit_cudnn = 10;
bool allow_tf32_cudnn = true;
bool allow_fp16_reduction_cublas = true;
bool allow_bf16_reduction_cublas = true;
CuBLASReductionOption allow_fp16_reduction_cublas =
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
CuBLASReductionOption allow_bf16_reduction_cublas =
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
bool allow_fp16_accumulation_cublas = false;
std::optional<int32_t> sm_carveout = std::nullopt;
bool enabled_mkldnn = true;

View File

@ -229,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
}
void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
return _resize_(sparse_dim, dense_dim, size);
_resize_(sparse_dim, dense_dim, size);
}
void resize_(
int64_t sparse_dim,
int64_t dense_dim,
ArrayRef<c10::SymInt> size) {
return _resize_(sparse_dim, dense_dim, size);
_resize_(sparse_dim, dense_dim, size);
}
// NOTE: this function will resize the sparse tensor and also set `indices`

View File

@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
}
}
return set_item(self, indices, value);
set_item(self, indices, value);
}
} // namespace indexing

View File

@ -765,7 +765,8 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
if (numel == 0) {
return;
} else if (numel < grain_size || at::get_num_threads() == 1) {
return serial_for_each(loop, {0, numel});
serial_for_each(loop, {0, numel});
return;
} else {
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
serial_for_each(loop, {begin, end});

View File

@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
}
void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
}
void check_names_valid_for(size_t tensor_dim, DimnameList names) {

View File

@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
const std::optional<Tensor>& gradient,
std::optional<bool> keep_graph,
bool create_graph) const {
return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
}
const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
@ -173,4 +173,12 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
}
std::optional<ScalarType> TensorBase::grad_dtype() const {
return impl::GetVariableHooks()->grad_dtype(*this);
}
void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
}
} // namespace at

View File

@ -930,6 +930,10 @@ public:
const TensorBase& requires_grad_(bool _requires_grad=true) const;
std::optional<ScalarType> grad_dtype() const;
void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
// View Variables
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -68,6 +68,8 @@ struct TORCH_API VariableHooksInterface {
const c10::OperatorHandle& op,
c10::DispatchKeySet dispatch_keys,
torch::jit::Stack* stack) const = 0;
virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
};
TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);

View File

@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
}
void checkInvariants() const {
return operatorDef_->op.checkInvariants();
operatorDef_->op.checkInvariants();
}
c10::ArrayRef<at::Tag> getTags() const {
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
}
#endif
const auto& kernel = entry.lookup(dispatchKeySet);
return kernel.callBoxed(op, dispatchKeySet, stack);
kernel.callBoxed(op, dispatchKeySet, stack);
}
} // namespace c10

View File

@ -422,18 +422,34 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
abType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
#ifndef USE_ROCM
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
fp16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
}
#endif
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
#ifndef USE_ROCM
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
bf16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
}
#endif
} else {
@ -1120,8 +1136,15 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
}
if (prop->major >= 5) {
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
TORCH_CHECK(fp16_reduction !=
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
"..., allow_splitk=False) requires the cuBLASLt backend");
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
cublas_flags = static_cast<cublasMath_t>(
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
}
// Disallow fp16 reductions that could lead to unexpected overflow issues.
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
@ -1180,8 +1203,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
GEMM_CHECK_ARGVALUES(at::BFloat16);
#ifndef USE_ROCM
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
TORCH_CHECK(bf16_reduction !=
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
"..., allow_splitk=False) requires the cuBLASLt backend");
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
cublas_flags = static_cast<cublasMath_t>(
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
}
#endif
#if defined(USE_ROCM)
@ -1270,7 +1300,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
}
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
} else{
at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
@ -1577,18 +1607,34 @@ bool gemm_and_bias(
abType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
#ifndef USE_ROCM
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
fp16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
}
#endif
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
#ifndef USE_ROCM
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
bf16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
}
#endif
}

View File

@ -326,6 +326,23 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
#endif
}
bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Requires compute capability 8.0 (Ampere) or newer
if (prop->major >= 8) {
return true;
} else {
return false;
}
#else
return false;
#endif
}
long CUDAHooks::versionCuDNN() const {
#if AT_CUDNN_ENABLED()
return CUDNN_VERSION;

View File

@ -45,6 +45,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
bool supportsDilatedConvolutionWithCuDNN() const override;
bool supportsDepthwiseConvolutionWithCuDNN() const override;
bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
bool supportsBFloat16RNNWithCuDNN() const override;
bool hasCUDART() const override;
long versionCUDART() const override;
long versionCuDNN() const override;

View File

@ -166,6 +166,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return false;
}
virtual bool supportsBFloat16RNNWithCuDNN() const {
return false;
}
virtual long versionCuDNN() const {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}

View File

@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s
// used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
return dynamicLayerBack(op, stack, true);
dynamicLayerBack(op, stack, true);
}
static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
return dynamicLayerBack(op, stack, false);
dynamicLayerBack(op, stack, false);
}
TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {

View File

@ -12,7 +12,7 @@
#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
#define MPS_ERROR_RUNTIME_TOO_LOW \
"The MPS backend is supported on MacOS 13.0+.", \
"The MPS backend is supported on MacOS 14.0+. ", \
"Current OS version can be queried using `sw_vers`"
#define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
"as the MPS framework doesn't support float64. Please use float32 instead."

View File

@ -375,7 +375,7 @@ static void bf16_gemv_trans(
const at::BFloat16 beta,
at::BFloat16* y,
const int incy) {
return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
}
template <>

View File

@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
const Tensor& raw_boundaries) {
Tensor trimmed_sorter;
Tensor raw_sorter;
return searchsorted_maybe_trim_input_tensors(
searchsorted_maybe_trim_input_tensors(
trimmed_input,
trimmed_boundaries,
trimmed_sorter,

View File

@ -93,6 +93,12 @@ inline bool cond_cudnn_grid_sampler(
const TensorBase& input,
const TensorBase& grid
) {
auto st = input.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
st = grid.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
return (
at::native::cudnn_is_acceptable(input) &&
at::native::cudnn_is_acceptable(grid) &&

View File

@ -108,6 +108,13 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
return false;
}
bool use_cudnn(const Tensor& t) {
bool acceptable = at::cudnn_is_acceptable(t);
auto st = t.scalar_type();
bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();
return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
}
template<typename T>
using pair_of = std::pair<T, T>;
@ -1200,7 +1207,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
bool train, \
bool bidirectional, \
bool batch_first) { \
if (at::cudnn_is_acceptable(_input)) { \
if (use_cudnn(_input)) { \
Tensor output, hy; \
NAME##_cudnn_stub( \
_input.device().type(), \
@ -1262,7 +1269,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
double dropout_p, \
bool train, \
bool bidirectional) { \
if (at::cudnn_is_acceptable(data)) { \
if (use_cudnn(data)) { \
Tensor output, hy; \
NAME##_packed_cudnn_stub( \
data.device().type(), \
@ -1430,7 +1437,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (at::cudnn_is_acceptable(_input)) {
if (use_cudnn(_input)) {
Tensor output, hy, cy;
lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
num_layers, dropout_p, train, bidirectional, batch_first);
@ -1491,7 +1498,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (at::cudnn_is_acceptable(data)) {
if (use_cudnn(data)) {
Tensor output, hy, cy;
lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
_params, has_biases, num_layers, dropout_p, train, bidirectional);

View File

@ -23,14 +23,6 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cast_Byte_native.h>
#include <ATen/ops/_cast_Char_native.h>
#include <ATen/ops/_cast_Double_native.h>
#include <ATen/ops/_cast_Float_native.h>
#include <ATen/ops/_cast_Half_native.h>
#include <ATen/ops/_cast_Int_native.h>
#include <ATen/ops/_cast_Long_native.h>
#include <ATen/ops/_cast_Short_native.h>
#include <ATen/ops/_dim_arange_native.h>
#include <ATen/ops/_efficientzerotensor_native.h>
#include <ATen/ops/_empty_affine_quantized.h>

View File

@ -91,9 +91,6 @@ bool cudnn_is_acceptable(const TensorBase& self) {
return false;
if (!self.is_cuda())
return false;
auto st = self.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
if (!detail::getCUDAHooks().compiledWithCuDNN())
return false;
// cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty

View File

@ -25,11 +25,11 @@
namespace at::native {
void _backward(const Tensor& self, TensorList inputs, const std::optional<Tensor>& gradient_opt, std::optional<bool> keep_graph, bool create_graph) {
return self._backward(inputs, gradient_opt, keep_graph, create_graph);
self._backward(inputs, gradient_opt, keep_graph, create_graph);
}
void set_data(Tensor& self, const Tensor& new_data) {
return self.set_data(new_data);
self.set_data(new_data);
}
Tensor data(const Tensor& self) {
@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) {
}
void retain_grad(Tensor& self) {
return self.retain_grad();
self.retain_grad();
}
bool retains_grad(const Tensor& self) {

View File

@ -300,7 +300,8 @@ void div_floor_kernel(TensorIteratorBase& iter) {
// In the special case of unsigned integer division, floor division is
// equivalent to truncation division (since the signs of the divisor and
// dividend are always the same)
return div_trunc_kernel(iter);
div_trunc_kernel(iter);
return;
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
// There's no SIMD integer division, so don't try to vectorize it.
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() {

View File

@ -749,21 +749,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
// });
if (iter_dtype == kByte) {
return cpu_hflip_vec<uint8_t>(iter);
cpu_hflip_vec<uint8_t>(iter);
return;
} else if (iter_dtype == kChar) {
return cpu_hflip_vec<int8_t>(iter);
cpu_hflip_vec<int8_t>(iter);
return;
} else if (iter_dtype == kInt) {
return cpu_hflip_vec<int32_t>(iter);
cpu_hflip_vec<int32_t>(iter);
return;
} else if (iter_dtype == kLong) {
return cpu_hflip_vec<int64_t>(iter);
cpu_hflip_vec<int64_t>(iter);
return;
} else if (iter_dtype == kShort) {
return cpu_hflip_vec<int16_t>(iter);
cpu_hflip_vec<int16_t>(iter);
return;
} else if (iter_dtype == kBool) {
return cpu_hflip_vec<bool>(iter);
cpu_hflip_vec<bool>(iter);
return;
} else if (iter_dtype == kFloat) {
return cpu_hflip_vec<float>(iter);
cpu_hflip_vec<float>(iter);
return;
} else if (iter_dtype == kDouble) {
return cpu_hflip_vec<double>(iter);
cpu_hflip_vec<double>(iter);
return;
}
}
// other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
@ -778,10 +786,12 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
c == input_strides_2[1] &&
c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well
) {
return cpu_hflip_channels_last_vec(iter);
cpu_hflip_channels_last_vec(iter);
return;
}
// Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
return cpu_vflip_memcpy(iter);
cpu_vflip_memcpy(iter);
return;
}
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",

View File

@ -96,11 +96,14 @@ static void pow_tensor_scalar_kernel(
dtype == kBFloat16 || isComplexType(dtype)) {
// Dispatch to fast specialization for sqrt, rsqrt and reciprocal
if (exp_scalar.equal(.5)) {
return sqrt_kernel(iter);
sqrt_kernel(iter);
return;
} else if (exp_scalar.equal(-0.5)) {
return rsqrt_kernel(iter);
rsqrt_kernel(iter);
return;
} else if (exp_scalar.equal(-1.0)) {
return reciprocal_kernel(iter);
reciprocal_kernel(iter);
return;
}
}

View File

@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl(
} else {
if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
// type promotion that does cast and reduction in a single kernel
return norm_kernel_cpu_impl<at::Half, float>(iter, val);
norm_kernel_cpu_impl<at::Half, float>(iter, val); return;
} else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
// type promotion that does cast and reduction in a single kernel
return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
norm_kernel_cpu_impl<at::BFloat16, float>(iter, val); return;
}
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] {

View File

@ -428,10 +428,11 @@ void fp16_gemv_trans(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0);
#if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
if (at::globalContext().allowFP16ReductionCPU()) {
return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
return;
}
#endif
return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
}
float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) {
@ -465,7 +466,7 @@ void bf16_gemv_trans(
at::BFloat16* y,
const int incy) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0);
return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
}
float fp16_dot(

View File

@ -285,8 +285,8 @@ static bool isSupportedHipLtROCmArch(int index) {
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
#endif
#if ROCM_VERSION >= 60500
"gfx950"
#if ROCM_VERSION >= 70000
"gfx950", "gfx1150", "gfx1151"
#endif
};
return at::detail::getCUDAHooks().isGPUArch(archs, index);

View File

@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() {
#ifdef USE_ROCM
#define SKIP_SORTED_INDICES 32
template <typename scalar_t, int SZ>
__global__ void indexing_backward_kernel(
__global__ void indexing_backward_kernel_many_indices(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) {
using opmath_t = at::opmath_type<scalar_t>;
@ -254,7 +254,8 @@ __global__ void indexing_backward_kernel_stride_1(
}
}
}
#else
#endif
template <typename scalar_t, int SZ>
__global__ void indexing_backward_kernel(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@ -333,6 +334,7 @@ __global__ void indexing_backward_kernel(
}
}
#ifndef USE_ROCM
template <typename scalar_t>
__global__ void indexing_backward_kernel_stride_1(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@ -780,11 +782,43 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
kBool,
kBFloat16);
} else {
#ifdef USE_ROCM
if (num_indices >= 200000)
AT_DISPATCH_V2(
expandedValue.scalar_type(),
"indexing_backward_many_indices",
AT_WRAP([&] {
indexing_backward_kernel_many_indices<scalar_t, UNROLL><<<new_grid, block, smem_dups_size, stream>>>(
sorted_indices.const_data_ptr<int64_t>(),
orig_indices.const_data_ptr<int64_t>(),
expandedValue.const_data_ptr<scalar_t>(),
src_.mutable_data_ptr<scalar_t>(),
num_indices,
sliceSize,
strideBefore,
nElemBefore,
accumulate);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,
kBFloat16);
else
#endif
AT_DISPATCH_V2(
expandedValue.scalar_type(),
"indexing_backward",
AT_WRAP([&] {
indexing_backward_kernel<scalar_t, UNROLL><<<KERNEL_GRID, block, KERNEL_SMEM, stream>>>(
indexing_backward_kernel<scalar_t, UNROLL><<<grid, block, 0, stream>>>(
sorted_indices.const_data_ptr<int64_t>(),
orig_indices.const_data_ptr<int64_t>(),
expandedValue.const_data_ptr<scalar_t>(),

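A simplified sketch of the dispatch introduced in this hunk of `index_put_with_sort_kernel`: on ROCm builds, once `num_indices` reaches the 200000 threshold from the diff, the many-indices kernel variant is launched; otherwise the generic kernel runs. `launch_many_indices` and `launch_generic` are hypothetical stand-ins for the `AT_DISPATCH_V2` blocks and kernel launches in the real code:

#include <cstdint>
#include <cstdio>

#define USE_ROCM 1  // assumption for the sketch; in PyTorch this comes from the build system

static void launch_many_indices(int64_t n) { std::printf("many-indices kernel, n=%lld\n", (long long)n); }
static void launch_generic(int64_t n)      { std::printf("generic kernel, n=%lld\n", (long long)n); }

void dispatch_indexing_backward(int64_t num_indices) {
#ifdef USE_ROCM
  // Large index counts take the ROCm-specific kernel variant.
  if (num_indices >= 200000) {
    launch_many_indices(num_indices);
    return;
  }
#endif
  launch_generic(num_indices);
}

int main() {
  dispatch_indexing_backward(1000);    // generic path
  dispatch_indexing_backward(500000);  // many-indices path on ROCm builds
  return 0;
}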
View File

@ -121,7 +121,7 @@ void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_si
"cufft_set_plan_cache_max_size: expected 0 <= device_index < ",
at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
device_index);
return cufft_get_plan_cache(device_index).resize(max_size);
cufft_get_plan_cache(device_index).resize(max_size);
}
int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index) {
@ -137,7 +137,7 @@ void cufft_clear_plan_cache_impl(DeviceIndex device_index) {
"cufft_clear_plan_cache: expected 0 <= device_index < ",
at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
device_index);
return cufft_get_plan_cache(device_index).clear();
cufft_get_plan_cache(device_index).clear();
}
} // namespace at::native::detail

View File

@ -230,7 +230,7 @@ constexpr int BLOCK_THREADS = 256;
constexpr int RADIX_BITS = 8;
constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS
constexpr int RADIX_MASK = (RADIX_DIGITS - 1);
static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS");
static_assert(RADIX_DIGITS <= BLOCK_THREADS, "RADIX_DIGITS must be <= BLOCK_THREADS");
constexpr int MIN_ITEMS_PER_THREAD = 4;
constexpr int MAX_ITEMS_PER_THREAD = 64;
@ -242,11 +242,10 @@ __global__ void fill(T* x, T value, IndexType size) {
}
}
// find the kth smallest value,
// for largest topk, k_to_find = slice_size - k + 1
// compute local histogram for each block
template <typename T, typename IndexType, typename Bitwise, int Dim>
C10_LAUNCH_BOUNDS_1(BLOCK_THREADS)
__global__ void radixFindKthValues(
__global__ void computeBlockDigitCounts(
at::cuda::detail::TensorInfo<const T, IndexType> input,
uint32_t slice_size,
uint32_t* ks_to_find, // size: num_slices, unused arg but for mysterious reasons perf is better when it's present
@ -321,12 +320,51 @@ __global__ void radixFindKthValues(
}
}
// compute global histogram and cumsum for each row
__global__ void computeDigitCumSum(
short* counts,
uint32_t* digit_cum_sum,
uint32_t blocks_per_slice) {
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
int digit_idx = threadIdx.x;
uint32_t slice_idx = blockIdx.x;
typedef cub::BlockScan<uint32_t, RADIX_DIGITS> BlockScan;
__shared__ typename BlockScan::TempStorage scan_storage;
// accumulates counters from multiple blocks
uint32_t digit_count = 0;
if (threadIdx.x < RADIX_DIGITS) {
constexpr int HISTO_ACCUM_TILE = 4;
uint32_t rounds = blocks_per_slice / HISTO_ACCUM_TILE;
for (int iter = 0; iter < rounds; iter++) {
int base = HISTO_ACCUM_TILE * iter;
#pragma unroll
for (int j = 0; j < HISTO_ACCUM_TILE; j++) {
int blk = base + j;
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
}
}
for (int blk = HISTO_ACCUM_TILE * rounds; blk < blocks_per_slice; blk++) {
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
}
}
// compute the block-wide inclusive prefix sum
uint32_t digit_count_cumsum;
BlockScan(scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
__syncthreads();
if (threadIdx.x < RADIX_DIGITS) {
digit_cum_sum[tidx] = digit_count_cumsum;
}
}
// Assumption: k can not be larger than UINT32_MAX
template <typename Bitwise, typename T>
C10_LAUNCH_BOUNDS_1(RADIX_DIGITS) // one thread per digit
__global__ void computeBlockwiseWithinKCounts(
Bitwise* desires_in, // size: num_slices
short* counts, // size: num_slices * blocks_per_slice * radix_digits
uint32_t* digit_cum_sum,
uint32_t* ks_to_find_in, // size: num_slices
uint32_t blocks_per_slice,
int current_bit,
@ -338,7 +376,7 @@ __global__ void computeBlockwiseWithinKCounts(
Bitwise* desires_out,
uint32_t num_blocks
) {
// This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel.
// This kernel should be launched with the same number of blocks as the `computeBlockDigitCounts` kernel.
int tidx = threadIdx.x;
uint32_t block_idx = getLinearBlockId<uint32_t>();
uint32_t slice_idx = block_idx / blocks_per_slice;
@ -351,36 +389,15 @@ __global__ void computeBlockwiseWithinKCounts(
if (block_idx >= num_blocks) {
return;
}
typedef cub::BlockScan<uint32_t, BLOCK_THREADS> BlockScan;
union __align__(16) TempStorage {
uint32_t digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice
typename BlockScan::TempStorage scan_storage;
};
__shared__ TempStorage temp_storage;
// accumulates counters from multiple blocks
uint32_t digit_count = 0;
if (tidx < RADIX_DIGITS) {
for (int blk = 0; blk < blocks_per_slice; ++blk) {
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + tidx];
}
}
// compute the block-wide inclusive prefix sum
uint32_t digit_count_cumsum;
BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
__syncthreads();
// every thread also need the perfix_sum of it's left value for comparison, so save a copy in shared mem
if (tidx < RADIX_DIGITS) {
temp_storage.digit_count_cumsum[tidx] = digit_count_cumsum;
}
__syncthreads();
__shared__ Bitwise desired;
uint32_t k_to_find = ks_to_find_in[slice_idx];
if (tidx < RADIX_DIGITS) {
uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1];
uint32_t position = slice_idx * RADIX_DIGITS + tidx;
uint32_t digit_count_cumsum = digit_cum_sum[position];
uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : digit_cum_sum[position - 1];
// if not the last pass: update desired and ks_to_find
// if last pass: write out the kth value
@ -466,7 +483,7 @@ template <typename Bitwise>
__global__ void computeBlockwiseKthCounts(
Bitwise* desires, // size: num_slices
short* counts, // size: num_slices * blocks_per_slice * radix_digits
uint32_t num_blocks, // the number of blocks used by `radixFindKthValues` kernel
uint32_t num_blocks, // the number of blocks used by `computeBlockDigitCounts` kernel
uint32_t blocks_per_slice,
// outputs:
uint32_t* kthCounts // size: num_slices * blocks_per_slice == num_blocks
@ -649,9 +666,7 @@ void launch(
T* kthValues = reinterpret_cast<T*>(kthValues_buffer.get());
TORCH_CHECK(blocks_per_slice <= std::numeric_limits<uint32_t>::max(), "blocks_per_slice larger than uint32 maximum is not supported");
auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t));
uint32_t* semaphores = reinterpret_cast<uint32_t*>(semaphores_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream));
auto ks_to_find_buffer = allocator.allocate(2 * numInputSlices * sizeof(uint32_t));
uint32_t* ks_to_find = reinterpret_cast<uint32_t*>(ks_to_find_buffer.get());
@ -668,6 +683,10 @@ void launch(
static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits<short>::max(),
"blockwise counter too large");
auto digit_cum_sum_buffer = allocator.allocate(numInputSlices * RADIX_DIGITS * sizeof(uint32_t));
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
#if CUB_SUPPORTS_SCAN_BY_KEY()
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
@ -691,7 +710,7 @@ void launch(
// iterate radix bits for multiple passes
for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) {
radixFindKthValues<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
computeBlockDigitCounts<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
input,
inputSliceSize,
ks_to_find_in, // unused arg
@ -704,10 +723,14 @@ void launch(
desired_in,
counts);
C10_CUDA_KERNEL_LAUNCH_CHECK();
computeDigitCumSum<<<numInputSlices, RADIX_DIGITS, 0, stream>>>(counts, digit_cum_sum, blocks_per_slice);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// we unconditionally call this kernel to update desired/ks_to_find/kthValues
// if cub supports scan_by_key we additionally do k counts
computeBlockwiseWithinKCounts<Bitwise, T><<<grid, RADIX_DIGITS, 0, stream>>>(
desired_in, counts, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
desired_in, counts, digit_cum_sum, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// swap desired/ks_to_find in and out for next iter
auto tmp_desired = desired_in;

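The refactor above moves the per-slice digit-count accumulation and prefix sum out of `computeBlockwiseWithinKCounts` into the new `computeDigitCumSum` kernel, which writes `digit_cum_sum` to global memory. A host-side sketch of the same idea, using `std::inclusive_scan` instead of `cub::BlockScan` and made-up counts, shows how the cumulative sums locate the digit of the k-th value and the k to search for in the next pass (mirroring the desired/ks_to_find update in the kernel):

#include <array>
#include <cstdint>
#include <cstdio>
#include <numeric>

int main() {
  // Total count of elements whose current radix digit equals d, for d = 0..7
  // (RADIX_DIGITS is 256 in the kernel; 8 keeps the example small).
  std::array<uint32_t, 8> digit_count{3, 0, 5, 2, 7, 1, 0, 4};
  std::array<uint32_t, 8> digit_cum_sum{};
  std::inclusive_scan(digit_count.begin(), digit_count.end(), digit_cum_sum.begin());

  uint32_t k_to_find = 9;  // looking for the k-th smallest under the current prefix
  for (size_t d = 0; d < digit_cum_sum.size(); ++d) {
    uint32_t left = (d == 0) ? 0 : digit_cum_sum[d - 1];
    if (left < k_to_find && k_to_find <= digit_cum_sum[d]) {
      // The k-th value has digit d; the next pass searches within this bucket
      // with k reduced by the cumulative count to its left.
      std::printf("digit=%zu, k for next pass=%u\n", d, k_to_find - left);
      break;
    }
  }
  return 0;
}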
View File

@ -1107,10 +1107,14 @@ void ldl_factor_kernel(
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
return ldl_factor_cusolver(
{ ldl_factor_cusolver(
LD, pivots, info, upper, hermitian);
return;
}
case at::LinalgBackend::Magma:
return ldl_factor_magma(LD, pivots, info, upper, hermitian);
{ ldl_factor_magma(LD, pivots, info, upper, hermitian);
return;
}
default:
// By default use cusolver if available and magma otherwise.
// If cusolver and magma 2.5.4+ are both available and hermitian=true,
@ -1122,8 +1126,10 @@ void ldl_factor_kernel(
LD, pivots, info, upper, hermitian);
}
#endif
return ldl_factor_cusolver(
LD, pivots, info, upper, hermitian);
{ ldl_factor_cusolver(
LD, pivots, info, upper, hermitian);
return;
}
#else
return ldl_factor_magma(LD, pivots, info, upper, hermitian);
#endif
@ -1839,11 +1845,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
// For the benchmarks see
// https://github.com/pytorch/pytorch/pull/56253#discussion_r622851107
if (input.size(-2) <= 256 && batchCount(input) >= std::max<int64_t>(2, input.size(-2) / 16)) {
return geqrf_batched_cublas(input, tau);
geqrf_batched_cublas(input, tau);
return;
} else {
return geqrf_cusolver(input, tau);
geqrf_cusolver(input, tau);
return;
}
return geqrf_batched_cublas(input, tau);
geqrf_batched_cublas(input, tau);
return;
};
auto preferred_backend = at::globalContext().linalgPreferredBackend();
@ -1856,10 +1865,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
// - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly.
// - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu
case at::LinalgBackend::Magma:
return geqrf_magma(input, tau);
{ geqrf_magma(input, tau);
return;
}
case at::LinalgBackend::Cusolver:
default:
return geqrf_cusolver_backend(input, tau);
{ geqrf_cusolver_backend(input, tau);
return;
}
}
#else
return geqrf_magma(input, tau);
@ -2703,13 +2716,17 @@ void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
return gels_magma(a, b, infos);
{ gels_magma(a, b, infos);
return;
}
case at::LinalgBackend::Cusolver:
default:
// linalg_lstsq_gels is a generic function that is implemented using
// geqrf_stub, ormqr_stub, and triangular_solve_stub
// It dispatches to cuSOLVER for CUDA inputs if USE_LINALG_SOLVER is defined
return linalg_lstsq_gels(a, b, infos);
{ linalg_lstsq_gels(a, b, infos);
return;
}
}
#else
return gels_magma(a, b, infos);

View File

@ -337,8 +337,7 @@ struct BenchmarkCache {
engine_cache_order.begin(), engine_cache_order, it->second.second);
}
} else {
engine_cache.erase(key);
engine_cache.emplace(
engine_cache.insert_or_assign(
key,
std::make_pair(results, engine_cache_order.end())); // dummy iterator
}

View File

@ -371,8 +371,7 @@ struct MHAGraphCache {
}
void update(const KeyType& key, T& results) {
engine_cache.erase(key);
engine_cache.emplace(key, std::move(results));
engine_cache.insert_or_assign(key, std::move(results));
}
};
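Both cache updates above (`BenchmarkCache` and `MHAGraphCache`) switch from `erase()` followed by `emplace()` to a single `insert_or_assign()` call. A minimal sketch of the semantics on a `std::unordered_map` (C++17):

#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, int> cache;
  cache.insert_or_assign("plan", 1);  // key absent: inserts
  cache.insert_or_assign("plan", 2);  // key present: overwrites the mapped value
  // Equivalent to the old two-step form:
  //   cache.erase("plan");
  //   cache.emplace("plan", 2);
  return 0;
}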

View File

@ -1222,7 +1222,7 @@ cudnnRNNAlgo_t get_algo(
}
cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
if (dtype == CUDNN_DATA_HALF) {
if (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) {
return CUDNN_DATA_FLOAT;
}
return dtype;

View File

@ -772,13 +772,21 @@ void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
template <>
void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
auto dprops = at::cuda::getCurrentDeviceProperties();
std::string_view arch(dprops->gcnArchName);
if (arch == "gfx1100") {
static const std::vector<std::string> wmma_archs = {
"gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
#if ROCM_VERSION >= 70000
"gfx1150", "gfx1151"
#endif
};
if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
} else{
}
else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
else {
TORCH_CHECK(false, "gemm_internal_ck<at::BFloat16> unsupported gfx arch");
}
}
} // namespace at::native

View File

@ -599,11 +599,21 @@ void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
template <>
void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) {
static const std::vector<std::string> wmma_archs = {
"gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
#if ROCM_VERSION >= 70000
"gfx1150", "gfx1151"
#endif
};
if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
} else{
}
else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
}
else {
TORCH_CHECK(false, "gemm_internal_ck<at::Half> unsupported gfx arch");
}
}
} // namespace at::native
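Both the `at::Half` and `at::BFloat16` CK GEMM entry points now share the same arch-based dispatch: the WMMA kernels cover the RDNA parts (gfx11xx/gfx12xx, with gfx1150/gfx1151 gated on ROCm 7.0), the existing CK path covers gfx9, and anything else is rejected. A rough sketch of that selection, where `is_arch_in` is a hypothetical stand-in for `at::detail::getCUDAHooks().isGPUArch()`:

#include <cstdio>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

static bool is_arch_in(std::string_view arch, const std::vector<std::string>& prefixes) {
  for (const auto& p : prefixes) {
    if (arch.substr(0, p.size()) == std::string_view(p)) return true;  // prefix match for the sketch
  }
  return false;
}

void gemm_dispatch(std::string_view arch) {
  static const std::vector<std::string> wmma_archs = {
      // gfx1150/gfx1151 are gated on ROCM_VERSION >= 70000 in the real code.
      "gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201", "gfx1150", "gfx1151"};
  if (is_arch_in(arch, wmma_archs)) {
    std::printf("WMMA path\n");
  } else if (is_arch_in(arch, {"gfx9"})) {
    std::printf("CK gfx9 path\n");
  } else {
    throw std::runtime_error("unsupported gfx arch");
  }
}

int main() {
  gemm_dispatch("gfx1101");  // WMMA path
  gemm_dispatch("gfx942");   // CK gfx9 path
  return 0;
}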

View File

@ -373,59 +373,67 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kStrided) {
// TODO: Add native CSC support via cuSPARSE if supported.
return addmm_dense_result(
addmm_dense_result(
mat2.transpose(0, 1).to_sparse_csr(),
mat1.transpose(0, 1),
beta,
alpha,
result.transpose(0, 1));
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
return addmm_dense_result(
addmm_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
}
if (mat2.layout() == kSparseBsc) {
if (result.layout() == kStrided) {
return addmm_dense_result(
addmm_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
}
}
if (mat1.layout() == kSparseCsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
return addmm_dense_result(mat1, mat2, beta, alpha, result);
addmm_dense_result(mat1, mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kStrided) {
return addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
return;
}
if (result.layout() == kSparseCsr) {
return addmm_sparse_result(mat1, mat2, beta, alpha, result);
addmm_sparse_result(mat1, mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
// TODO: CSR @ CSC kernel would be very fast due to format alignment
return addmm_sparse_input_dense_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
addmm_sparse_input_dense_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
return;
}
if (result.layout() == kSparseCsr) {
// TODO: CSR @ CSC kernel would be very fast due to format alignment
return addmm_sparse_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
addmm_sparse_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
return;
}
}
}
@ -433,56 +441,62 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
// TODO: avoid csc->csr conversion with native csc support
return addmm_dense_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
addmm_dense_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kSparseCsr) {
// TODO: avoid csc->csr conversion with native csc support
return addmm_sparse_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
addmm_sparse_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
return addmm_sparse_input_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
addmm_sparse_input_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
if (result.layout() == kSparseCsr) {
// TODO avoid csc->csr
return addmm_sparse_result(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
addmm_sparse_result(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
return;
}
if (result.layout() == kSparseCsc) {
return addmm_sparse_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
addmm_sparse_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
}
}
if (mat1.layout() == kSparseBsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
return addmm_dense_result(mat1, mat2, beta, alpha, result);
addmm_dense_result(mat1, mat2, beta, alpha, result);
return;
}
}
}
TORCH_CHECK(
false,
"addmm: computation on CPU is not implemented for ",
result.layout(),
" + ",
mat1.layout(),
" @ ",
mat2.layout());
false,
"addmm: computation on CPU is not implemented for ",
result.layout(),
" + ",
mat1.layout(),
" @ ",
mat2.layout());
}
/*
@ -496,16 +510,16 @@ void addmm_out_sparse_csr(
[out] result of the operation.
*/
void addmv_out_sparse_csr(
const Tensor& mat,
const Tensor& vec,
const Scalar& beta,
const Scalar& alpha,
const Tensor& result) {
const Tensor& mat,
const Tensor& vec,
const Scalar& beta,
const Scalar& alpha,
const Tensor& result) {
#if !AT_USE_MKL_SPARSE()
TORCH_CHECK(
false,
"Calling addmv on a sparse CPU tensor requires Linux platform. ",
"Please use PyTorch built with MKL on Linux.");
false,
"Calling addmv on a sparse CPU tensor requires Linux platform. ",
"Please use PyTorch built with MKL on Linux.");
#else
c10::MaybeOwned<Tensor> result_ = prepare_dense_vector_for_mkl(result);
c10::MaybeOwned<Tensor> vec_ = prepare_dense_vector_for_mkl(vec);

View File

@ -5,38 +5,6 @@
# representing ScalarType's. They are now superseded by usage of
# `aten::to()`. The ops remain here for backward compatibility purposes.
# DEPRECATED. DO NOT USE
- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# Computes the gradient of current tensor w.r.t. graph leaves.
- func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
manual_cpp_binding: True
@ -6725,12 +6693,12 @@
- func: native_norm(Tensor self, Scalar p=2) -> Tensor
dispatch:
SparseCPU, SparseCUDA: norm_sparse
SparseCPU, SparseCUDA, SparseMPS: norm_sparse
autogen: native_norm.out
- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor
dispatch:
SparseCPU, SparseCUDA: norm_sparse
SparseCPU, SparseCUDA, SparseMPS: norm_sparse
autogen: native_norm.ScalarOpt_dim_dtype_out
- func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
@ -6856,14 +6824,14 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA: sparse_dtype_norm
SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm
- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
structured_delegate: norm.out
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA: sparse_norm
SparseCPU, SparseCUDA, SparseMPS: sparse_norm
- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
structured: True

View File

@ -810,7 +810,8 @@ void addmm_out_sparse_csr(
if (mat1.layout() == kSparseBsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided)
return block_sparse_mm(input, mat1, mat2, beta, alpha, result);
{ block_sparse_mm(input, mat1, mat2, beta, alpha, result); return;
}
}
}
@ -819,13 +820,13 @@ void addmm_out_sparse_csr(
if (result.layout() == kStrided) {
auto result_t = result.transpose(-2, -1);
auto input_t = (result.is_same(input) ? result_t : input.transpose(-2, -1));
return block_sparse_mm(
block_sparse_mm(
input_t,
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result_t);
result_t); return;
}
}
}
@ -840,41 +841,41 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kStrided) {
// TODO: Add native CSC support via cuSPARSE if supported.
return spmm(
spmm(
mat2.transpose(0, 1).to_sparse_csr(),
mat1.transpose(0, 1),
beta,
alpha,
result.transpose(0, 1));
result.transpose(0, 1)); return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
return spmm(
spmm(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
result.transpose(-2, -1)); return;
}
}
}
if (mat1.layout() == kSparseCsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
return spmm(mat1, mat2, beta, alpha, result);
spmm(mat1, mat2, beta, alpha, result); return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kSparseCsr) {
return spgemm(mat1, mat2, beta, alpha, result);
spgemm(mat1, mat2, beta, alpha, result); return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kSparseCsr) {
// TODO: Add native CSC support via cuSPARSE if supported.
// CSR @ CSC kernel would be very fast due to format alignment
return spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result);
spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result); return;
}
}
}
@ -882,27 +883,28 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
// TODO: Add native CSC support via cuSPARSE if supported.
return spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kSparseCsr)
// TODO: Add native CSC support via cuSPARSE if supported.
return spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
{ spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kSparseCsr) {
// TODO: Add native CSC support via cuSPARSE if supported.
return spgemm(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
spgemm(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result); return;
}
if (result.layout() == kSparseCsc) {
return spgemm(
spgemm(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
result.transpose(-2, -1)); return;
}
}
}
@ -933,7 +935,7 @@ void addmv_out_sparse_csr(
const Scalar& alpha,
const Tensor& result) {
if (mat.layout() == kSparseBsr) {
return block_sparse_mv(mat, vec, beta, alpha, result);
block_sparse_mv(mat, vec, beta, alpha, result); return;
}
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
@ -1213,9 +1215,9 @@ void triangular_solve_out_sparse_csr(
}
if (A.layout() == kSparseBsr) {
if (B.size(-1) == 1) {
return block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular);
block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular); return;
} else {
return block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular);
block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular); return;
}
}
#ifdef USE_ROCM

View File

@ -117,7 +117,7 @@ class FwdKernel:
def get_all(cls) -> list["FwdKernel"]:
kernels: list[FwdKernel] = []
for aligned, dtype, (sm, sm_max) in itertools.product(
[True, False], DTYPES.keys(), zip(SM, SM[1:])
[True, False], DTYPES.keys(), itertools.pairwise(SM)
):
# Remove some kernels we don't use
if dtype == "bf16" and sm < 80:
@ -228,7 +228,7 @@ class BwdKernel:
for aligned, dtype, (sm, sm_max), apply_dropout, max_k in itertools.product(
[True, False],
DTYPES.keys(),
zip(SM, SM[1:]),
itertools.pairwise(SM),
[True, False],
[32, 64, 128, 2**16],
):

View File

@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Benchmark for NVSHMEM tile reduce operations.
Usage:
python benchmarks/distributed/bench_nvshmem_tile_reduce.py
This benchmark measures the performance of tile reduce operations across different
matrix sizes and tile configurations.
"""
import time
import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
skip_but_pass_in_sandcastle_if,
skipIfRocm,
)
# Decorator
def requires_nvshmem():
return skip_but_pass_in_sandcastle_if(
not symm_mem.is_nvshmem_available(),
"bench_nvshmem_tile_reduce requires NVSHMEM, skipping benchmark",
)
# So that benchmarks are written in device-agnostic way
device_type = "cuda"
device_module = torch.get_device_module(device_type)
@requires_nvshmem()
@requires_cuda_p2p_access()
class NVSHMEMTileReduceBenchmark(MultiProcContinuousTest):
def _init_device(self) -> None:
# TODO: relax this requirement (seems to hang without it)
device_module.set_device(self.device)
# Set NVSHMEM as SymmMem backend
symm_mem.set_backend("NVSHMEM")
@property
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
def _benchmark_tile_reduce_single(
self,
full_size: int,
tile_size: int,
warmup_iters: int = 5,
bench_iters: int = 10,
) -> dict:
"""
Benchmark a single configuration of tile reduce.
Args:
full_size: Size of the full matrix (full_size x full_size)
tile_size: Size of the tile (tile_size x tile_size)
warmup_iters: Number of warmup iterations
bench_iters: Number of benchmark iterations
Returns:
Dictionary with benchmark results
"""
self._init_device()
group_name = dist.group.WORLD.group_name
symm_mem.enable_symm_mem_for_group(group_name)
dtype = torch.float
# Allocate full matrices
full_inp = symm_mem.empty(
full_size, full_size, dtype=dtype, device=self.device
).fill_(self.rank)
full_out = symm_mem.empty(
full_size, full_size, dtype=dtype, device=self.device
).fill_(0)
slice_ut = slice(0, tile_size)
inp_tile = full_inp[slice_ut, slice_ut]
out_tile = full_out[slice_ut, slice_ut]
root = 0
# Warmup iterations
for _ in range(warmup_iters):
torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
torch.cuda.synchronize(self.device)
# Benchmark iterations
times = []
dist.barrier()
torch.cuda.synchronize(self.device)
start_time = time.perf_counter()
for _ in range(bench_iters):
torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
torch.cuda.synchronize(self.device)
end_time = time.perf_counter()
times.append((end_time - start_time) / bench_iters)
# Calculate statistics
times = torch.tensor(times, dtype=torch.float64)
tile_elements = tile_size * tile_size
tile_bytes = (
tile_elements * dtype.itemsize
if hasattr(dtype, "itemsize")
else tile_elements * 4
)
results = {
"full_size": full_size,
"tile_size": tile_size,
"tile_elements": tile_elements,
"tile_bytes": tile_bytes,
"world_size": self.world_size,
"mean_time_ms": times.mean().item() * 1000,
"std_time_ms": times.std().item() * 1000,
"min_time_ms": times.min().item() * 1000,
"max_time_ms": times.max().item() * 1000,
"throughput_gb_s": tile_bytes / (times.mean().item() * 1e9),
"elements_per_sec": tile_elements / times.mean().item(),
}
return results
@skipIfRocm
def test_benchmark_tile_reduce_various_sizes(self) -> None:
"""
Benchmark tile reduce across various matrix sizes.
"""
# Test various matrix sizes
tile_sizes = [512, 1024, 2048, 4096, 8192, 16384]
full_size = tile_sizes[-1]
warmup_iters = 5
bench_iters = 20
results = []
for tile_size in tile_sizes:
try:
result = self._benchmark_tile_reduce_single(
full_size, tile_size, warmup_iters, bench_iters
)
results.append(result)
if self.rank == 0:
print(
f"Matrix Size: {full_size}x{full_size}, Tile Size: {tile_size}x{tile_size}"
)
print(
f" Mean Time: {result['mean_time_ms']:.3f} ± {result['std_time_ms']:.3f} ms"
)
print(f" Throughput: {result['throughput_gb_s']:.2f} GB/s")
print(f" Bytes: {result['tile_bytes']:.0f}")
print()
except Exception as e:
if self.rank == 0:
print(f"Failed to benchmark matrix size {full_size}: {e}")
# Print summary
if self.rank == 0 and results:
print("=== BENCHMARK SUMMARY ===")
print(
f"{'Matrix Size':<12} {'Tile Size':<10} {'Time (ms)':<12} {'Throughput (GB/s)':<18} {'Bytes':<15}"
)
print("-" * 70)
for result in results:
print(
f"{result['full_size']}x{result['full_size']:<7} "
f"{result['tile_size']}x{result['tile_size']:<5} "
f"{result['mean_time_ms']:<12.3f} "
f"{result['throughput_gb_s']:<18.2f} "
f"{result['tile_bytes']:<15.0f}"
)
if __name__ == "__main__":
# For standalone usage, you'd need to set up distributed environment
# For now, this is meant to be run via the PyTorch test framework
from torch.testing._internal.common_utils import run_tests
run_tests()

View File

@ -10,7 +10,6 @@ import pandas as pd
flaky_models = {
"yolov3",
"gluon_inception_v3",
"detectron2_maskrcnn_r_101_c4",
"timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699
"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148
@ -36,15 +35,11 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
{
"Background_Matting",
"alexnet",
"cait_m36_384",
"dla102",
"demucs",
"densenet121",
"detectron2_fcos_r_50_fpn",
"doctr_det_predictor",
"doctr_reco_predictor",
"dpn107",
"fbnetv3_b",
"hf_BigBird",
"hf_Longformer",
"hf_Reformer",
@ -52,7 +47,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"hf_T5",
"hf_T5_base",
"hf_T5_generate",
"levit_128",
"llava",
"microbench_unbacked_tolist_sum",
"mnasnet1_0",
@ -69,7 +63,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"squeezenet1_1",
"stable_diffusion_text_encoder",
"stable_diffusion_unet",
"swsl_resnext101_32x16d",
"timm_efficientdet",
"timm_efficientnet",
"timm_nfnet",

View File

@ -10,7 +10,6 @@ import pandas as pd
flaky_models = {
"yolov3",
"gluon_inception_v3",
"detectron2_maskrcnn_r_101_c4",
"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148
"detectron2_fcos_r_50_fpn",
@ -32,7 +31,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
flaky_models.update(
{
"alexnet",
"cait_m36_384",
"demucs",
"densenet121",
"detectron2_fcos_r_50_fpn",
@ -44,7 +42,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"hf_Roberta_base",
"hf_T5",
"hf_T5_base",
"levit_128",
"llava",
"microbench_unbacked_tolist_sum",
"resnet50",

View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,7
botnet26t_256,pass,6
cait_m36_384,eager_fail_to_run,0
coat_lite_mini,pass,6
convit_base,pass,7
convmixer_768_32,pass,5
convnext_base,pass,7
crossvit_9_240,pass,7
cspdarknet53,pass,7
deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dm_nfnet_f0,pass,6
dpn107,pass,6
eca_botnext26ts_256,pass,7
eca_halonext26ts,pass,7
ese_vovnet19b_dw,pass,7
fbnetc_100,pass,7
fbnetv3_b,pass,6
gernet_l,pass,6
ghostnet_100,pass,6
gluon_inception_v3,pass,7
gmixer_24_224,pass,6
gmlp_s16_224,pass,7
hrnet_w18,pass,5
inception_v3,pass,6
jx_nest_base,pass,7
lcnet_050,fail_accuracy,6
levit_128,pass,7
mixer_b16_224,pass,7
mixnet_l,pass,6
mnasnet_100,pass,7
mobilenetv2_100,pass,7
@ -146,100 +42,16 @@ nfnet_l0,pass,7
pit_b_224,pass,6
pnasnet5large,pass,5
poolformer_m36,pass,6
regnety_002,pass,6
repvgg_a2,pass,7
res2net101_26w_4s,pass,6
res2net50_14w_8s,pass,6
res2next50,pass,6
resmlp_12_224,pass,6
resnest101e,pass,6
rexnet_100,pass,7
sebotnet33ts_256,pass,6
selecsls42b,pass,6
spnasnet_100,pass,7
swin_base_patch4_window7_224,pass,7
swsl_resnext101_32x16d,pass,6
tf_efficientnet_b0,pass,6
tf_mixnet_l,pass,6
tinynet_a,pass,6
tnt_s_patch16_224,pass,7
twins_pcpvt_base,pass,7
visformer_small,pass,7
vit_base_patch16_224,pass,7
volo_d1_224,pass,7
xcit_large_24_p8_224,pass_due_to_skip,7


View File

@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,0
@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -94,10 +54,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,0
@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -94,10 +54,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,timeout,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,timeout,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,7
botnet26t_256,pass,6
cait_m36_384,eager_fail_to_run,0
coat_lite_mini,pass,6
convit_base,pass,7
convmixer_768_32,pass,5
convnext_base,pass,7
crossvit_9_240,pass,7
cspdarknet53,pass,7
deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dm_nfnet_f0,pass,6
dpn107,pass,6
eca_botnext26ts_256,pass,7
eca_halonext26ts,pass,7
ese_vovnet19b_dw,pass,7
fbnetc_100,pass,7
fbnetv3_b,pass,6
gernet_l,pass,6
ghostnet_100,pass,6
gluon_inception_v3,pass,7
gmixer_24_224,pass,6
gmlp_s16_224,pass,7
hrnet_w18,pass,5
inception_v3,pass,6
jx_nest_base,pass,7
lcnet_050,fail_accuracy,6
levit_128,pass,7
mixer_b16_224,pass,7
mixnet_l,pass,6
mnasnet_100,pass,7
mobilenetv2_100,pass,7
@ -146,100 +42,16 @@ nfnet_l0,pass,7
pit_b_224,pass,6
pnasnet5large,pass,5
poolformer_m36,pass,6
regnety_002,pass,6
repvgg_a2,pass,7
res2net101_26w_4s,pass,6
res2net50_14w_8s,pass,6
res2next50,pass,6
resmlp_12_224,pass,6
resnest101e,pass,6
rexnet_100,pass,7
sebotnet33ts_256,pass,6
selecsls42b,pass,6
spnasnet_100,pass,7
swin_base_patch4_window7_224,pass,7
swsl_resnext101_32x16d,pass,6
tf_efficientnet_b0,pass,6
tf_mixnet_l,pass,6
tinynet_a,pass,6
tnt_s_patch16_224,pass,7
twins_pcpvt_base,pass,7
visformer_small,pass,7
vit_base_patch16_224,pass,7
volo_d1_224,pass,7
xcit_large_24_p8_224,pass_due_to_skip,7


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,fail_to_run,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0


View File

@ -10,126 +10,22 @@ beit_base_patch16_224,pass,7
botnet26t_256,pass,6
cait_m36_384,eager_fail_to_run,0
coat_lite_mini,pass,6
convit_base,pass,7
convmixer_768_32,pass,5
convnext_base,pass,7
crossvit_9_240,pass,7
cspdarknet53,pass,7
deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dm_nfnet_f0,pass,6
dpn107,pass,6
eca_botnext26ts_256,pass,7
eca_halonext26ts,pass,7
ese_vovnet19b_dw,pass,7
fbnetc_100,pass,7
fbnetv3_b,pass,6
gernet_l,pass,6
ghostnet_100,pass,6
gluon_inception_v3,pass,7
gmixer_24_224,pass,6
gmlp_s16_224,pass,7
hrnet_w18,pass,5
inception_v3,pass,6
jx_nest_base,pass,7
lcnet_050,pass,6
levit_128,pass,7
mixer_b16_224,pass,7
mixnet_l,pass,6
mnasnet_100,pass,7
mobilenetv2_100,pass,7
@ -146,100 +42,16 @@ nfnet_l0,pass,7
pit_b_224,pass,6
pnasnet5large,pass,5
poolformer_m36,pass,6
regnety_002,pass,6
repvgg_a2,pass,7
res2net101_26w_4s,pass,6
res2net50_14w_8s,pass,6
res2next50,pass,6
resmlp_12_224,pass,6
resnest101e,pass,6
rexnet_100,pass,7
sebotnet33ts_256,pass,6
selecsls42b,pass,6
spnasnet_100,pass,7
swin_base_patch4_window7_224,pass,7
swsl_resnext101_32x16d,pass,6
tf_efficientnet_b0,pass,6
tf_mixnet_l,pass,6
tinynet_a,pass,6
tnt_s_patch16_224,pass,7
twins_pcpvt_base,pass,7
visformer_small,pass,7
vit_base_patch16_224,pass,7
volo_d1_224,pass,7
xcit_large_24_p8_224,pass_due_to_skip,7


View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0


View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5

View File

@@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
botnet26t_256,pass,0
cait_m36_384,pass,0
coat_lite_mini,pass,0
convit_base,pass,0
convmixer_768_32,pass,0
convnext_base,pass,0
crossvit_9_240,pass,0
cspdarknet53,pass,0
deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dm_nfnet_f0,pass,0
dpn107,pass,0
eca_botnext26ts_256,pass,0
eca_halonext26ts,pass,0
ese_vovnet19b_dw,pass,0
fbnetc_100,pass,0
fbnetv3_b,pass,0
gernet_l,pass,0
ghostnet_100,pass,0
gluon_inception_v3,pass,0
gmixer_24_224,pass,0
gmlp_s16_224,pass,0
hrnet_w18,pass,0
inception_v3,pass,0
jx_nest_base,pass,0
lcnet_050,pass,0
levit_128,pass,0
mixer_b16_224,pass,0
mixnet_l,pass,0
mnasnet_100,pass,0
mobilenetv2_100,pass,0
@@ -146,100 +42,16 @@ nfnet_l0,pass,0
pit_b_224,pass,0
pnasnet5large,pass,0
poolformer_m36,pass,0
regnety_002,pass,0
repvgg_a2,pass,0
res2net101_26w_4s,pass,0
res2net50_14w_8s,pass,0
res2next50,pass,0
resmlp_12_224,pass,0
resnest101e,pass,0
rexnet_100,pass,0
sebotnet33ts_256,pass,0
selecsls42b,pass,0
spnasnet_100,pass,0
swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
tf_efficientnet_b0,pass,0
tf_mixnet_l,pass,0
tinynet_a,pass,0
tnt_s_patch16_224,pass,0
twins_pcpvt_base,pass,0
visformer_small,pass,0
vit_base_patch16_224,pass,0
volo_d1_224,pass,0
xcit_large_24_p8_224,pass,0

View File

@@ -10,126 +10,22 @@ beit_base_patch16_224,pass,7
botnet26t_256,pass,6
cait_m36_384,eager_fail_to_run,0
coat_lite_mini,pass,6
convit_base,pass,7
convmixer_768_32,pass,5
convnext_base,pass,7
crossvit_9_240,pass,7
cspdarknet53,pass,7
deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dm_nfnet_f0,pass,6
dpn107,pass,6
eca_botnext26ts_256,pass,7
eca_halonext26ts,pass,7
ese_vovnet19b_dw,pass,7
fbnetc_100,pass,7
fbnetv3_b,pass,6
gernet_l,pass,6
ghostnet_100,pass,6
gluon_inception_v3,pass,7
gmixer_24_224,pass,6
gmlp_s16_224,pass,7
hrnet_w18,pass,5
inception_v3,pass,6
jx_nest_base,pass,7
lcnet_050,pass,6
levit_128,pass,7
mixer_b16_224,pass,7
mixnet_l,pass,6
mnasnet_100,pass,7
mobilenetv2_100,pass,7
@@ -146,100 +42,16 @@ nfnet_l0,pass,7
pit_b_224,pass,6
pnasnet5large,pass,5
poolformer_m36,pass,6
regnety_002,pass,6
repvgg_a2,pass,7
res2net101_26w_4s,pass,6
res2net50_14w_8s,pass,6
res2next50,pass,6
resmlp_12_224,pass,6
resnest101e,pass,6
rexnet_100,pass,7
sebotnet33ts_256,pass,6
selecsls42b,pass,6
spnasnet_100,pass,7
swin_base_patch4_window7_224,pass,7
swsl_resnext101_32x16d,pass,6
tf_efficientnet_b0,pass,6
tf_mixnet_l,pass,6
tinynet_a,pass,6
tnt_s_patch16_224,pass,7
twins_pcpvt_base,pass,7
visformer_small,pass,7
vit_base_patch16_224,pass,7
volo_d1_224,pass,7
xcit_large_24_p8_224,pass_due_to_skip,7

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0

Some files were not shown because too many files have changed in this diff.
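
For reference, every accuracy CSV in this compare view uses the same three-column schema (name, accuracy, graph_breaks), where accuracy is a status string such as pass, pass_due_to_skip, eager_fail_to_run, or eager_1st_run_OOM. Below is a minimal sketch, assuming that header row and plain CSV formatting, of a hypothetical helper for tallying statuses locally; it is not part of this PR or of the benchmark tooling.

import csv
import sys
from collections import Counter

def summarize_accuracy_csv(path):
    # Tally the accuracy column of a name,accuracy,graph_breaks CSV
    # (e.g. how many models report pass vs. eager_fail_to_run).
    counts = Counter()
    with open(path, newline="") as f:
        for row in csv.DictReader(f):  # assumes the header row: name,accuracy,graph_breaks
            counts[row["accuracy"]] += 1
    return counts

if __name__ == "__main__":
    for csv_path in sys.argv[1:]:
        print(csv_path, dict(summarize_accuracy_csv(csv_path)))

Running it against a downloaded copy of one of these CSVs prints a per-status count for that file.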