Add error handling for self.stack when bytes exceed the limit

Fix CI on the max length conversion
Fix more byte output
2025-10-23 14:59:34 +08:00 · 2025-10-10 02:48:22 +00:00 · 2025-10-10 02:48:22 +00:00 · 2025-10-10 02:48:21 +00:00 · 2025-10-10 02:48:21 +00:00 · 2025-10-10 02:48:21 +00:00
730 changed files with 11480 additions and 13484 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -37,9 +37,9 @@ case ${DOCKER_TAG_PREFIX} in
  rocm*)
    BASE_TARGET=rocm
    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-    # add gfx950 conditionally starting in ROCm 7.0
+    # add gfx950, gfx115x conditionally starting in ROCm 7.0
    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
+        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
    fi
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -344,7 +344,7 @@ docker build \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
-       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
+       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-27664085f804afc83df26f740bb46c365854f2c4
+7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950 conditionally starting in ROCm 7.0
+        # add gfx950, gfx115x conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -115,6 +115,9 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
 # cmake-3.28.0 from pip for onnxruntime
 RUN python3 -mpip install cmake==3.28.0

+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
+
 # build onnxruntime 1.21.0 from sources.
 # it is not possible to build it from sources using pip,
 # so just build it from upstream repository.
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -84,9 +84,9 @@ case ${image} in
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950 conditionally starting in ROCm 7.0
+        # add gfx950, gfx115x conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -120,9 +120,8 @@ ninja==1.11.1.4
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
-#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
+#Pinned versions: 0.55.2, 0.60.0
 #test that import: test_numba_integration.py
-#For numba issue see https://github.com/pytorch/pytorch/issues/51511
 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073

 #numpy
@ -242,10 +241,9 @@ pygments==2.15.0
 #Pinned versions: 14.1.0
 #test that import:

-scikit-image==0.19.3 ; python_version < "3.10"
-scikit-image==0.22.0 ; python_version >= "3.10"
+scikit-image==0.22.0
 #Description: image processing routines
-#Pinned versions:
+#Pinned versions: 0.22.0
 #test that import: test_nn.py

 #scikit-learn
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201

 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -18,7 +18,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 .PHONY: all
 all: magma-rocm70
 all: magma-rocm64
-all: magma-rocm63

 .PHONY:
 clean:
@ -34,8 +33,3 @@ magma-rocm70:
 magma-rocm64: DESIRED_ROCM := 6.4
 magma-rocm64:
 	$(DOCKER_RUN)
-
-.PHONY: magma-rocm63
-magma-rocm63: DESIRED_ROCM := 6.3
-magma-rocm63:
-	$(DOCKER_RUN)
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -67,7 +67,7 @@ fi
 #       wheels with cxx11-abi

 echo "Checking that the gcc ABI is what we expect"
-if [[ "$(uname)" != 'Darwin' &&  "$(uname -m)" != "s390x" ]]; then
+if [[ "$(uname)" != 'Darwin' ]]; then
  # We also check that there are cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -256,7 +256,7 @@ test_torchbench_smoketest() {
  local device=mps
  local dtypes=(undefined float16 bfloat16 notset)
  local dtype=${dtypes[$1]}
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
+  local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)

  for backend in eager inductor; do

@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
  local device=mps
  local dtypes=(undefined float16 bfloat16 notset)
  local dtype=${dtypes[$1]}
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
+  local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)

  echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
  local dtype_arg="--${dtype}"
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -838,7 +838,7 @@ test_dynamo_benchmark() {
      elif [[ "${suite}" == "timm_models" ]]; then
        export TORCHBENCH_ONLY_MODELS="inception_v3"
      elif [[ "${suite}" == "torchbench" ]]; then
-        export TORCHBENCH_ONLY_MODELS="hf_Bert"
+        export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
      fi
    fi
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
@ -869,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
  mkdir -p "$TEST_REPORTS_DIR"

  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
-    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
+    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
  # The threshold value needs to be actively maintained to make this check useful
  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4

  # Check memory compression ratio for a few models
-  for test in hf_Albert timm_vision_transformer; do
+  for test in BERT_pytorch yolov3; do
    python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
      --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
      --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
  done

  # Perform some "warm-start" runs for a few huggingface models.
-  for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
+  for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
    python benchmarks/dynamo/check_accuracy.py \
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move this to .ci/docker/requirements-ci.txt
-python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
+python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"

 run_tests() {
    # Run nvidia-smi if available
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1

 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
-
-# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
-TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
-
-# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
-if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
-  TRITON_CONSTRAINT="platform_system == 'Linux'"
-fi
+TRITON_CONSTRAINT="platform_system == 'Linux'"

 if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@ -28,6 +28,10 @@ runs:
        echo "instance-type: $(get_ec2_metadata instance-type)"
        echo "system info $(uname -a)"

+    - name: Print GPU info (if present)
+      shell: bash
+      run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
+
    - name: Check if in a container runner
      shell: bash
      id: check_container_runner
@ -82,37 +86,6 @@ runs:
        # Prune all of the docker images
        docker system prune -af

-    - name: Manually resolve download.pytorch.org
-      shell: bash
-      continue-on-error: true
-      run: |
-        set +e
-        set -x
-
-        PT_DOMAIN=download.pytorch.org
-        # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
-        # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
-        # one is returned at random
-        RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
-
-        if [ -z "${RESOLVED_IP}" ]; then
-          echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
-          RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
-
-          if [ -z "${RESOLVED_IP}" ]; then
-            echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
-            exit 1
-          fi
-        fi
-
-        if grep -r "${PT_DOMAIN}" /etc/hosts; then
-          # Clean up any old records first
-          sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
-        fi
-
-        echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
-        cat /etc/hosts
-
    - name: Check that the docker daemon is running
      shell: bash
      continue-on-error: true
--- a/.github/scripts/drci_mocks.json.gz
+++ b/.github/scripts/drci_mocks.json.gz
--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -18,6 +18,7 @@ class GitHubComment:
    body_text: str
    created_at: str
    author_login: str
+    author_url: Optional[str]
    author_association: str
    editor_login: Optional[str]
    database_id: int
--- a/.github/scripts/gql_mocks.json.gz
+++ b/.github/scripts/gql_mocks.json.gz
--- a/.github/scripts/test_check_labels.py
+++ b/.github/scripts/test_check_labels.py
@ -38,6 +38,7 @@ def mock_get_comments() -> list[GitHubComment]:
            body_text="mock_body_text",
            created_at="",
            author_login="",
+            author_url=None,
            author_association="",
            editor_login=None,
            database_id=1,
@ -48,6 +49,7 @@ def mock_get_comments() -> list[GitHubComment]:
            body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
            created_at="",
            author_login=BOT_AUTHORS[1],
+            author_url=None,
            author_association="",
            editor_login=None,
            database_id=2,
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -32,6 +32,7 @@ from trymerge import (
    main as trymerge_main,
    MandatoryChecksMissingError,
    MergeRule,
+    PostCommentError,
    RE_GHSTACK_DESC,
    read_merge_rules,
    remove_job_name_suffix,
@ -588,6 +589,23 @@ class TestTryMerge(TestCase):
            self.assertEqual(mock_merge_base, pr.get_merge_base())
            mocked_gh_fetch_merge_base.assert_called_once()

+    def test_app_can_revert(self, *args: Any) -> None:
+        pr = GitHubPR("pytorch", "pytorch", 164660)
+        repo = DummyGitRepo()
+        app_comment_id, impostor_comment_id = 3375785595, 3377647892
+        # Check that app can revert
+        self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
+        # But impostor can not
+        self.assertRaises(
+            PostCommentError,
+            lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
+        )
+        # Despite its name being the name of the bot
+        self.assertEqual(
+            pr.get_comment_by_id(impostor_comment_id).author_login,
+            "pytorch-auto-revert",
+        )
+

@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -234,6 +234,7 @@ query ($owner: String!, $name: String!, $number: Int!) {
          createdAt
          author {
            login
+            url
          }
          authorAssociation
          editor {
@ -1093,6 +1094,7 @@ class GitHubPR:
            body_text=node["bodyText"],
            created_at=node["createdAt"] if "createdAt" in node else "",
            author_login=node["author"]["login"],
+            author_url=node["author"].get("url", None),
            author_association=node["authorAssociation"],
            editor_login=editor["login"] if editor else None,
            database_id=node["databaseId"],
@ -2029,6 +2031,11 @@ def validate_revert(
    # For some reason, one can not be a member of private repo, only CONTRIBUTOR
    if pr.is_base_repo_private():
        allowed_reverters.append("CONTRIBUTOR")
+    # Special-case the pytorch-auto-revert app, which has no author association
+    # but should still be able to issue the revert command
+    if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
+        allowed_reverters.append("NONE")
+
    if author_association not in allowed_reverters:
        raise PostCommentError(
            f"Will not revert as @{author_login} is not one of "
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100

 on:
  schedule:
-    - cron: 15 0,12 * * 1-6
+    - cron: 15 0 * * 1-6
    - cron: 0 7 * * 0
  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
--- a/.github/workflows/inductor-perf-test-nightly-macos.yml
+++ b/.github/workflows/inductor-perf-test-nightly-macos.yml
@ -63,6 +63,7 @@ jobs:
      # Same as the build job
      python-version: 3.12.7
      test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
+      timeout-minutes: 300
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
--- a/.github/workflows/rocm.yml
+++ b/.github/workflows/rocm.yml
@ -59,3 +59,29 @@ jobs:
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
    secrets: inherit
+
+  linux-jammy-rocm-py3_10-gfx1100-test:
+    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3_10-gfx1100
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
+        ]}
+      tests-to-include: >
+         test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
+         test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
+         inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
+         inductor/test_decompose_mem_bound_mm inductor/test_flex_attention
+         inductor/test_max_autotune
+    secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -88,7 +88,7 @@ torch_compile_debug/
 # Listed manually because some files in this directory are not generated
 torch/testing/_internal/generated/annotated_fn_args.py
 torch/testing/_internal/data/*.pt
-torch/csrc/api/include/torch/version.h
+torch/headeronly/version.h
 torch/csrc/cudnn/cuDNN.cpp
 torch/csrc/generated
 torch/csrc/generic/TensorMethods.cpp
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -28,7 +28,7 @@ exclude_patterns = [
    'torch/lib/**',
    'venv/**',
    '**/*.pyi',
-    "tools/experimental/dynamic_shapes/torchfuzz/**",
+    "tools/experimental/torchfuzz/**",
    'tools/test/test_selective_build.py',
 ]
 command = [
@ -198,7 +198,7 @@ exclude_patterns = [
    'tools/test/gen_operators_yaml_test.py',
    'tools/test/gen_oplist_test.py',
    'tools/test/test_selective_build.py',
-    'tools/experimental/dynamic_shapes/torchfuzz/**',
+    'tools/experimental/torchfuzz/**',
 ]
 command = [
    'python3',
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -13,6 +13,9 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
 load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
 load("//:tools/bazel.bzl", "rules")

+# Export files for use by torch/headeronly (where version.h generation now lives)
+exports_files(["version.txt"])
+
 define_targets(rules = rules)

 COMMON_COPTS = [
@ -690,7 +693,9 @@ cc_library(
            "torch/csrc/*/generated/*.h",
            "torch/csrc/jit/serialization/mobile_bytecode_generated.h",
        ] + torch_cuda_headers,
-    ) + GENERATED_AUTOGRAD_CPP + [":version_h"],
+    ) + GENERATED_AUTOGRAD_CPP + [
+        "//torch/headeronly:version_h",
+    ],
    includes = [
        "third_party/kineto/libkineto/include",
        "torch/csrc",
--- a/2
+++ b/2
@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121
 ARG INSTALL_CHANNEL=whl/nightly
 # Automatically set by buildx
 # pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
-RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
+RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0

 ARG TARGETPLATFORM

--- a/aten/src/ATen/CPUApplyUtils.h
+++ b/aten/src/ATen/CPUApplyUtils.h
@ -144,8 +144,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
 inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
  checkDeviceType("CPU_tensor_apply", tensors, kCPU);
  checkLayout("CPU_tensor_apply", tensors, kStrided);
-  if (!_all_equal_numel(tensors))
-    TORCH_CHECK(false, _all_equal_numel_error(tensors));
+  TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
  // An empty tensor has no elements
  for (auto& t : tensors)
    if (t.numel() == 0)
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -483,8 +483,8 @@ at::BlasBackend Context::blasPreferredBackend() {
 #if ROCM_VERSION >= 60300
          "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
 #endif
-#if ROCM_VERSION >= 60500
-          "gfx950"
+#if ROCM_VERSION >= 70000
+          "gfx950", "gfx1150", "gfx1151"
 #endif
      };
      for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
@ -587,20 +587,33 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
  rocm_fa_preferred_backend = b;
 }

-bool Context::allowFP16ReductionCuBLAS() const {
+CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
  return allow_fp16_reduction_cublas;
 }

-void Context::setAllowFP16ReductionCuBLAS(bool b) {
-  allow_fp16_reduction_cublas = b;
+CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
+  TORCH_CHECK(
+      !(allow_reduced_precision && !allow_splitk),
+      "allow_splitk=False is not supported when reduced precision reductions are enabled");
+  if (allow_reduced_precision) {
+    return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
+  } else if (allow_splitk) {
+    return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
+  } else {
+    return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
+  }
 }

-bool Context::allowBF16ReductionCuBLAS() const {
+void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
+  allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
+}
+
+CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
  return allow_bf16_reduction_cublas;
 }

-void Context::setAllowBF16ReductionCuBLAS(bool b) {
-  allow_bf16_reduction_cublas = b;
+void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
+  allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
 }

 bool Context::allowFP16AccumulationCuBLAS() const {
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -38,6 +38,12 @@ namespace at {
 class Tensor;

 enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
+
+enum class CuBLASReductionOption : uint8_t {
+  AllowReducedPrecisionWithSplitK = 0,
+  DisallowReducedPrecisionAllowSplitK = 1,
+  DisallowReducedPrecisionDisallowSplitK = 2,
+};
 enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
 enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
 enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
@ -357,10 +363,14 @@ class TORCH_API Context {
  void setAllowTF32CuBLAS(bool);
  Float32MatmulPrecision float32MatmulPrecision() const;
  Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
-  bool allowFP16ReductionCuBLAS() const;
-  void setAllowFP16ReductionCuBLAS(bool);
-  bool allowBF16ReductionCuBLAS() const;
-  void setAllowBF16ReductionCuBLAS(bool);
+  CuBLASReductionOption allowFP16ReductionCuBLAS() const;
+  void setAllowFP16ReductionCuBLAS(
+      bool allow_reduced_precision,
+      bool allow_splitk = true);
+  CuBLASReductionOption allowBF16ReductionCuBLAS() const;
+  void setAllowBF16ReductionCuBLAS(
+      bool allow_reduced_precision,
+      bool allow_splitk = true);
  bool allowFP16AccumulationCuBLAS() const;
  void setAllowFP16AccumulationCuBLAS(bool);

@ -452,8 +462,10 @@ class TORCH_API Context {
      : at::Float32MatmulPrecision::HIGHEST;
  int benchmark_limit_cudnn = 10;
  bool allow_tf32_cudnn = true;
-  bool allow_fp16_reduction_cublas = true;
-  bool allow_bf16_reduction_cublas = true;
+  CuBLASReductionOption allow_fp16_reduction_cublas =
+      CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
+  CuBLASReductionOption allow_bf16_reduction_cublas =
+      CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
  bool allow_fp16_accumulation_cublas = false;
  std::optional<int32_t> sm_carveout = std::nullopt;
  bool enabled_mkldnn = true;
--- a/aten/src/ATen/SparseTensorImpl.h
+++ b/aten/src/ATen/SparseTensorImpl.h
@ -229,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
  }

  void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
-    return _resize_(sparse_dim, dense_dim, size);
+    _resize_(sparse_dim, dense_dim, size);
  }

  void resize_(
      int64_t sparse_dim,
      int64_t dense_dim,
      ArrayRef<c10::SymInt> size) {
-    return _resize_(sparse_dim, dense_dim, size);
+    _resize_(sparse_dim, dense_dim, size);
  }

  // NOTE: this function will resize the sparse tensor and also set `indices`
--- a/aten/src/ATen/TensorIndexing.cpp
+++ b/aten/src/ATen/TensorIndexing.cpp
@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
    }
  }

-  return set_item(self, indices, value);
+  set_item(self, indices, value);
 }

 } // namespace indexing
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@ -765,7 +765,8 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
  if (numel == 0) {
    return;
  } else if (numel < grain_size || at::get_num_threads() == 1) {
-    return serial_for_each(loop, {0, numel});
+    serial_for_each(loop, {0, numel});
+    return;
  } else {
    at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
      serial_for_each(loop, {begin, end});
--- a/aten/src/ATen/core/NamedTensor.cpp
+++ b/aten/src/ATen/core/NamedTensor.cpp
@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
 }

 void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
-  return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
+  impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
 }

 void check_names_valid_for(size_t tensor_dim, DimnameList names) {
--- a/aten/src/ATen/core/Tensor.cpp
+++ b/aten/src/ATen/core/Tensor.cpp
@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
        const std::optional<Tensor>& gradient,
        std::optional<bool> keep_graph,
        bool create_graph) const {
-  return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
+  impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
 }

 const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
@ -173,4 +173,12 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
  return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
 }

+std::optional<ScalarType> TensorBase::grad_dtype() const {
+  return impl::GetVariableHooks()->grad_dtype(*this);
+}
+
+void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
+  impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
+}
+
 } // namespace at
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -930,6 +930,10 @@ public:

  const TensorBase& requires_grad_(bool _requires_grad=true) const;

+  std::optional<ScalarType> grad_dtype() const;
+
+  void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
+
  // View Variables
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/aten/src/ATen/core/VariableHooksInterface.h
+++ b/aten/src/ATen/core/VariableHooksInterface.h
@ -68,6 +68,8 @@ struct TORCH_API VariableHooksInterface {
      const c10::OperatorHandle& op,
      c10::DispatchKeySet dispatch_keys,
      torch::jit::Stack* stack) const = 0;
+  virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
+  virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
 };

 TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
  }

  void checkInvariants() const {
-    return operatorDef_->op.checkInvariants();
+    operatorDef_->op.checkInvariants();
  }

  c10::ArrayRef<at::Tag> getTags() const {
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
  }
 #endif
  const auto& kernel = entry.lookup(dispatchKeySet);
-  return kernel.callBoxed(op, dispatchKeySet, stack);
+  kernel.callBoxed(op, dispatchKeySet, stack);
 }

 } // namespace c10
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -422,18 +422,34 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
    abType = CUDA_R_16F;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
 #ifndef USE_ROCM
-    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
-      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
-        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
+    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
+    if (fp16_reduction !=
+        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+      uint32_t mask =
+          fp16_reduction ==
+                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
+              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
+                 CUBLASLT_REDUCTION_SCHEME_NONE)
+              : CUBLASLT_REDUCTION_SCHEME_NONE;
+      preference.setAttribute(
+          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
    }
 #endif
  } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
    abType = CUDA_R_16BF;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
 #ifndef USE_ROCM
-    if (!at::globalContext().allowBF16ReductionCuBLAS()) {
-      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
-        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
+    auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
+    if (bf16_reduction !=
+        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+      uint32_t mask =
+          bf16_reduction ==
+                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
+              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
+                 CUBLASLT_REDUCTION_SCHEME_NONE)
+              : CUBLASLT_REDUCTION_SCHEME_NONE;
+      preference.setAttribute(
+          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
    }
 #endif
  } else {
@ -1120,8 +1136,15 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
  }
  if (prop->major >= 5) {
    cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
-    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
-      cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
+    TORCH_CHECK(fp16_reduction !=
+        at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
+          "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
+          "..., allow_splitk=False) requires the cuBLASLt backend");
+    if (fp16_reduction !=
+        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+      cublas_flags = static_cast<cublasMath_t>(
+          cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
    }
    // Disallow fp16 reductions that could lead to unexpected overflow issues.
    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
@ -1180,8 +1203,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
  GEMM_CHECK_ARGVALUES(at::BFloat16);
 #ifndef USE_ROCM
  cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
-  if (!at::globalContext().allowBF16ReductionCuBLAS()) {
-    cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+  auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();  // CuBLASReductionOption value for bf16
+  TORCH_CHECK(bf16_reduction !=
+      at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
+        "torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction("
+        "..., allow_splitk=False) requires the cuBLASLt backend");
+  if (bf16_reduction !=
+      at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+    cublas_flags = static_cast<cublasMath_t>(
+        cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
  }
 #endif
 #if defined(USE_ROCM)
@ -1270,7 +1300,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
  }
 #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
-    if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
+    if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version
      gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
    } else{
      at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
@ -1577,18 +1607,34 @@ bool gemm_and_bias(
    abType = CUDA_R_16F;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
 #ifndef USE_ROCM
-    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
-      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
-        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
+    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
+    if (fp16_reduction !=
+        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+      uint32_t mask =
+          fp16_reduction ==
+                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
+              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
+                 CUBLASLT_REDUCTION_SCHEME_NONE)
+              : CUBLASLT_REDUCTION_SCHEME_NONE;
+      preference.setAttribute(
+          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
    }
 #endif
  } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
    abType = CUDA_R_16BF;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
 #ifndef USE_ROCM
-    if (!at::globalContext().allowBF16ReductionCuBLAS()) {
-      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
-        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
+    auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
+    if (bf16_reduction !=
+        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+      uint32_t mask =
+          bf16_reduction ==
+                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
+              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
+                 CUBLASLT_REDUCTION_SCHEME_NONE)
+              : CUBLASLT_REDUCTION_SCHEME_NONE;
+      preference.setAttribute(
+          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
    }
 #endif
  }
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -326,6 +326,23 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
 #endif
 }

+bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
+#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
+  if (!hasCUDA()) {
+    return false;
+  }
+  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
+  // Require compute capability 8.x or newer (Ampere+); Volta (7.0) does not pass this check
+  if (prop->major >= 8) {
+    return true;
+  } else {
+    return false;
+  }
+#else
+  return false;
+#endif
+}
+
 long CUDAHooks::versionCuDNN() const {
 #if AT_CUDNN_ENABLED()
  return CUDNN_VERSION;
--- a/aten/src/ATen/cuda/detail/CUDAHooks.h
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.h
@ -45,6 +45,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  bool supportsDilatedConvolutionWithCuDNN() const override;
  bool supportsDepthwiseConvolutionWithCuDNN() const override;
  bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
+  bool supportsBFloat16RNNWithCuDNN() const override;
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@ -166,6 +166,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
    return false;
  }

+  virtual bool supportsBFloat16RNNWithCuDNN() const {
+    return false;
+  }
+
  virtual long versionCuDNN() const {
    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
  }
--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s

 // used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
 static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  return dynamicLayerBack(op, stack, true);
+  dynamicLayerBack(op, stack, true);
 }

 static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  return dynamicLayerBack(op, stack, false);
+  dynamicLayerBack(op, stack, false);
 }

 TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {
--- a/aten/src/ATen/mps/EmptyTensor.cpp
+++ b/aten/src/ATen/mps/EmptyTensor.cpp
@ -12,7 +12,7 @@

 #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
 #define MPS_ERROR_RUNTIME_TOO_LOW \
-  "The MPS backend is supported on MacOS 13.0+.", \
+  "The MPS backend is supported on MacOS 14.0+. ", \
  "Current OS version can be queried using `sw_vers`"
 #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
  "as the MPS framework doesn't support float64. Please use float32 instead."
--- a/aten/src/ATen/native/BlasKernel.cpp
+++ b/aten/src/ATen/native/BlasKernel.cpp
@ -375,7 +375,7 @@ static void bf16_gemv_trans(
  const at::BFloat16 beta,
  at::BFloat16* y,
  const int incy) {
-  return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
+  bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }

 template <>
--- a/aten/src/ATen/native/BucketizationUtils.h
+++ b/aten/src/ATen/native/BucketizationUtils.h
@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
    const Tensor& raw_boundaries) {
  Tensor trimmed_sorter;
  Tensor raw_sorter;
-  return searchsorted_maybe_trim_input_tensors(
+  searchsorted_maybe_trim_input_tensors(
      trimmed_input,
      trimmed_boundaries,
      trimmed_sorter,
--- a/aten/src/ATen/native/GridSamplerUtils.h
+++ b/aten/src/ATen/native/GridSamplerUtils.h
@ -93,6 +93,12 @@ inline bool cond_cudnn_grid_sampler(
  const TensorBase& input,
  const TensorBase& grid
 ) {
+  auto st = input.scalar_type();
+  if (!(st == kDouble || st == kFloat || st == kHalf))
+    return false;
+  st = grid.scalar_type();
+  if (!(st == kDouble || st == kFloat || st == kHalf))
+    return false;
  return (
    at::native::cudnn_is_acceptable(input) &&
    at::native::cudnn_is_acceptable(grid) &&
--- a/aten/src/ATen/native/RNN.cpp
+++ b/aten/src/ATen/native/RNN.cpp
@ -108,6 +108,13 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
  return false;
 }

+bool use_cudnn(const Tensor& t) {  // gate for the cuDNN RNN stubs: tensor must be cuDNN-acceptable AND have a supported dtype
+  bool acceptable = at::cudnn_is_acceptable(t);
+  auto st = t.scalar_type();
+  bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();  // bf16 only when the CUDA hooks report support
+  return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
+}
+
 template<typename T>
 using pair_of = std::pair<T, T>;

@ -1200,7 +1207,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
      bool train,                                                           \
      bool bidirectional,                                                   \
      bool batch_first) {                                                   \
-    if (at::cudnn_is_acceptable(_input)) {                                  \
+    if (use_cudnn(_input)) {                                                \
      Tensor output, hy;                                                    \
      NAME##_cudnn_stub(                                                    \
          _input.device().type(),                                           \
@ -1262,7 +1269,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
      double dropout_p,                                                     \
      bool train,                                                           \
      bool bidirectional) {                                                 \
-    if (at::cudnn_is_acceptable(data)) {                                    \
+    if (use_cudnn(data)) {                                                  \
      Tensor output, hy;                                                    \
      NAME##_packed_cudnn_stub(                                             \
          data.device().type(),                                             \
@ -1430,7 +1437,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
      TensorList _params, bool has_biases,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
  TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
-  if (at::cudnn_is_acceptable(_input)) {
+  if (use_cudnn(_input)) {
    Tensor output, hy, cy;
    lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
            num_layers, dropout_p, train, bidirectional, batch_first);
@ -1491,7 +1498,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
      TensorList _params, bool has_biases,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
  TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
-  if (at::cudnn_is_acceptable(data)) {
+  if (use_cudnn(data)) {
    Tensor output, hy, cy;
    lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
            _params, has_biases, num_layers, dropout_p, train, bidirectional);
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@ -23,14 +23,6 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
-#include <ATen/ops/_cast_Byte_native.h>
-#include <ATen/ops/_cast_Char_native.h>
-#include <ATen/ops/_cast_Double_native.h>
-#include <ATen/ops/_cast_Float_native.h>
-#include <ATen/ops/_cast_Half_native.h>
-#include <ATen/ops/_cast_Int_native.h>
-#include <ATen/ops/_cast_Long_native.h>
-#include <ATen/ops/_cast_Short_native.h>
 #include <ATen/ops/_dim_arange_native.h>
 #include <ATen/ops/_efficientzerotensor_native.h>
 #include <ATen/ops/_empty_affine_quantized.h>
--- a/aten/src/ATen/native/TensorProperties.cpp
+++ b/aten/src/ATen/native/TensorProperties.cpp
@ -91,9 +91,6 @@ bool cudnn_is_acceptable(const TensorBase& self) {
    return false;
  if (!self.is_cuda())
    return false;
-  auto st = self.scalar_type();
-  if (!(st == kDouble || st == kFloat || st == kHalf))
-    return false;
  if (!detail::getCUDAHooks().compiledWithCuDNN())
    return false;
  // cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty
--- a/aten/src/ATen/native/VariableMethodStubs.cpp
+++ b/aten/src/ATen/native/VariableMethodStubs.cpp
@ -25,11 +25,11 @@
 namespace at::native {

 void _backward(const Tensor& self, TensorList inputs, const std::optional<Tensor>& gradient_opt, std::optional<bool> keep_graph, bool create_graph) {
-  return self._backward(inputs, gradient_opt, keep_graph, create_graph);
+  self._backward(inputs, gradient_opt, keep_graph, create_graph);
 }

 void set_data(Tensor& self, const Tensor& new_data) {
-  return self.set_data(new_data);
+  self.set_data(new_data);
 }

 Tensor data(const Tensor& self) {
@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) {
 }

 void retain_grad(Tensor& self) {
-  return self.retain_grad();
+  self.retain_grad();
 }

 bool retains_grad(const Tensor& self) {
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@ -300,7 +300,8 @@ void div_floor_kernel(TensorIteratorBase& iter) {
    // In the special case of unsigned integer division, floor division is
    // equivalent to truncation division (since the signs of the divisor and
    // dividend are always the same)
-    return div_trunc_kernel(iter);
+    div_trunc_kernel(iter);
+    return;
  } else if (isIntegralType(dtype, /*includeBool*/ false)) {
    // There's no SIMD integer division, so don't try to vectorize it.
    AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() {
--- a/aten/src/ATen/native/cpu/IndexKernel.cpp
+++ b/aten/src/ATen/native/cpu/IndexKernel.cpp
@ -749,21 +749,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
        // });

        if (iter_dtype == kByte) {
-          return cpu_hflip_vec<uint8_t>(iter);
+          cpu_hflip_vec<uint8_t>(iter);
+          return;
        } else if (iter_dtype == kChar) {
-          return cpu_hflip_vec<int8_t>(iter);
+          cpu_hflip_vec<int8_t>(iter);
+          return;
        } else if (iter_dtype == kInt) {
-          return cpu_hflip_vec<int32_t>(iter);
+          cpu_hflip_vec<int32_t>(iter);
+          return;
        } else if (iter_dtype == kLong) {
-          return cpu_hflip_vec<int64_t>(iter);
+          cpu_hflip_vec<int64_t>(iter);
+          return;
        } else if (iter_dtype == kShort) {
-          return cpu_hflip_vec<int16_t>(iter);
+          cpu_hflip_vec<int16_t>(iter);
+          return;
        } else if (iter_dtype == kBool) {
-          return cpu_hflip_vec<bool>(iter);
+          cpu_hflip_vec<bool>(iter);
+          return;
        } else if (iter_dtype == kFloat) {
-          return cpu_hflip_vec<float>(iter);
+          cpu_hflip_vec<float>(iter);
+          return;
        } else if (iter_dtype == kDouble) {
-          return cpu_hflip_vec<double>(iter);
+          cpu_hflip_vec<double>(iter);
+          return;
        }
      }
      // other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
@ -778,10 +786,12 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
          c == input_strides_2[1] &&
          c == iter.element_size(0) * iter.shape()[0]  // checks if dim=1 is contiguous as well
      ) {
-        return cpu_hflip_channels_last_vec(iter);
+        cpu_hflip_channels_last_vec(iter);
+        return;
      }
      // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
-      return cpu_vflip_memcpy(iter);
+      cpu_vflip_memcpy(iter);
+      return;
    }

    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",
--- a/aten/src/ATen/native/cpu/PowKernel.cpp
+++ b/aten/src/ATen/native/cpu/PowKernel.cpp
@ -96,11 +96,14 @@ static void pow_tensor_scalar_kernel(
      dtype == kBFloat16 || isComplexType(dtype)) {
    // Dispatch to fast specialization for sqrt, rsqrt and reciprocal
    if (exp_scalar.equal(.5)) {
-      return sqrt_kernel(iter);
+      sqrt_kernel(iter);
+      return;
    } else if (exp_scalar.equal(-0.5)) {
-      return rsqrt_kernel(iter);
+      rsqrt_kernel(iter);
+      return;
    } else if (exp_scalar.equal(-1.0)) {
-      return reciprocal_kernel(iter);
+      reciprocal_kernel(iter);
+      return;
    }
  }

--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl(
  } else {
    if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
      // type promotion that does cast and reduction in a single kernel
-      return norm_kernel_cpu_impl<at::Half, float>(iter, val);
+      norm_kernel_cpu_impl<at::Half, float>(iter, val); return;
    } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
      // type promotion that does cast and reduction in a single kernel
-      return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
+      norm_kernel_cpu_impl<at::BFloat16, float>(iter, val); return;
    }

    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] {
--- a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp
@ -428,10 +428,11 @@ void fp16_gemv_trans(
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0);
 #if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
  if (at::globalContext().allowFP16ReductionCPU()) {
-    return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
+    fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
+    return;
  }
 #endif
-  return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
+  fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
 }

 float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) {
@ -465,7 +466,7 @@ void bf16_gemv_trans(
  at::BFloat16* y,
  const int incy) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0);
-  return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
+  bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
 }

 float fp16_dot(
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -285,8 +285,8 @@ static bool isSupportedHipLtROCmArch(int index) {
 #if ROCM_VERSION >= 60300
        "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
 #endif
-#if ROCM_VERSION >= 60500
-        "gfx950"
+#if ROCM_VERSION >= 70000
+        "gfx950", "gfx1150", "gfx1151"
 #endif
    };
    return at::detail::getCUDAHooks().isGPUArch(archs, index);
--- a/aten/src/ATen/native/cuda/Indexing.cu
+++ b/aten/src/ATen/native/cuda/Indexing.cu
@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() {
 #ifdef USE_ROCM
 #define SKIP_SORTED_INDICES 32
 template <typename scalar_t, int SZ>
-__global__ void indexing_backward_kernel(
+__global__ void indexing_backward_kernel_many_indices(
  const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
  int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) {
  using opmath_t = at::opmath_type<scalar_t>;
@ -254,7 +254,8 @@ __global__ void indexing_backward_kernel_stride_1(
    }
  }
 }
-#else
+#endif
+
 template <typename scalar_t, int SZ>
 __global__ void indexing_backward_kernel(
  const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@ -333,6 +334,7 @@ __global__ void indexing_backward_kernel(
  }
 }

+#ifndef USE_ROCM
 template <typename scalar_t>
 __global__ void indexing_backward_kernel_stride_1(
  const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@ -780,11 +782,43 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
            kBool,
            kBFloat16);
        } else {
+#ifdef USE_ROCM
+          if (num_indices >= 200000)
+            AT_DISPATCH_V2(
+              expandedValue.scalar_type(),
+              "indexing_backward_many_indices",
+              AT_WRAP([&] {
+                indexing_backward_kernel_many_indices<scalar_t, UNROLL><<<new_grid, block, smem_dups_size, stream>>>(
+                  sorted_indices.const_data_ptr<int64_t>(),
+                  orig_indices.const_data_ptr<int64_t>(),
+                  expandedValue.const_data_ptr<scalar_t>(),
+                  src_.mutable_data_ptr<scalar_t>(),
+                  num_indices,
+                  sliceSize,
+                  strideBefore,
+                  nElemBefore,
+                  accumulate);
+                C10_CUDA_KERNEL_LAUNCH_CHECK();
+              }),
+              AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+              // AT_EXPAND(AT_FLOAT8_TYPES),
+              // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+              // should not be supported here, then reenable AT_FLOAT8_DTYPES
+              kFloat8_e4m3fn,
+              kFloat8_e5m2,
+              kFloat8_e4m3fnuz,
+              kFloat8_e5m2fnuz,
+              kComplexHalf,
+              kHalf,
+              kBool,
+              kBFloat16);
+          else
+#endif
          AT_DISPATCH_V2(
            expandedValue.scalar_type(),
            "indexing_backward",
            AT_WRAP([&] {
-              indexing_backward_kernel<scalar_t, UNROLL><<<KERNEL_GRID, block, KERNEL_SMEM, stream>>>(
+              indexing_backward_kernel<scalar_t, UNROLL><<<grid, block, 0, stream>>>(
                sorted_indices.const_data_ptr<int64_t>(),
                orig_indices.const_data_ptr<int64_t>(),
                expandedValue.const_data_ptr<scalar_t>(),
--- a/aten/src/ATen/native/cuda/SpectralOps.cpp
+++ b/aten/src/ATen/native/cuda/SpectralOps.cpp
@ -121,7 +121,7 @@ void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_si
    "cufft_set_plan_cache_max_size: expected 0 <= device_index < ",
    at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
    device_index);
-  return cufft_get_plan_cache(device_index).resize(max_size);
+  cufft_get_plan_cache(device_index).resize(max_size);
 }

 int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index) {
@ -137,7 +137,7 @@ void cufft_clear_plan_cache_impl(DeviceIndex device_index) {
    "cufft_clear_plan_cache: expected 0 <= device_index < ",
    at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
    device_index);
-  return cufft_get_plan_cache(device_index).clear();
+  cufft_get_plan_cache(device_index).clear();
 }

 } // namespace at::native::detail
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@ -230,7 +230,7 @@ constexpr int BLOCK_THREADS = 256;
 constexpr int RADIX_BITS = 8;
 constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS
 constexpr int RADIX_MASK = (RADIX_DIGITS - 1);
-static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS");
+static_assert(RADIX_DIGITS <= BLOCK_THREADS, "RADIX_DIGITS must be <= BLOCK_THREADS");
 constexpr int MIN_ITEMS_PER_THREAD = 4;
 constexpr int MAX_ITEMS_PER_THREAD = 64;

@ -242,11 +242,10 @@ __global__ void fill(T* x, T value, IndexType size) {
  }
 }

-// find the kth smallest value,
-// for largest topk, k_to_find = slice_size - k + 1
+// compute local histogram for each block
 template <typename T, typename IndexType, typename Bitwise, int Dim>
 C10_LAUNCH_BOUNDS_1(BLOCK_THREADS)
-__global__ void radixFindKthValues(
+__global__ void computeBlockDigitCounts(
    at::cuda::detail::TensorInfo<const T, IndexType> input,
    uint32_t slice_size,
    uint32_t* ks_to_find,  // size: num_slices, unused arg but for mysterious reasons perf is better when it's present
@ -321,12 +320,51 @@ __global__ void radixFindKthValues(
  }
 }

+// For one slice per thread block (blockIdx.x) and one radix digit per thread (threadIdx.x): sum the per-block digit counts and write their inclusive prefix sum to digit_cum_sum
+__global__ void computeDigitCumSum(
+  short* counts,
+  uint32_t* digit_cum_sum,
+  uint32_t blocks_per_slice) {
+  int tidx = threadIdx.x + blockIdx.x * blockDim.x;  // flat output index into digit_cum_sum
+  int digit_idx = threadIdx.x;
+  uint32_t slice_idx = blockIdx.x;
+
+  typedef cub::BlockScan<uint32_t, RADIX_DIGITS> BlockScan;
+  __shared__ typename BlockScan::TempStorage scan_storage;
+  // per-thread total for this digit, accumulated over all blocks_per_slice histograms of this slice
+  uint32_t digit_count = 0;
+  if (threadIdx.x < RADIX_DIGITS) {
+    constexpr int HISTO_ACCUM_TILE = 4;
+    uint32_t rounds = blocks_per_slice / HISTO_ACCUM_TILE;  // number of full tiles of HISTO_ACCUM_TILE blocks
+    for (int iter = 0; iter < rounds; iter++)  {
+      int base = HISTO_ACCUM_TILE * iter;
+      #pragma unroll
+      for (int j = 0; j < HISTO_ACCUM_TILE; j++) {
+        int blk = base + j;
+        digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
+      }
+    }
+    for (int blk = HISTO_ACCUM_TILE * rounds; blk < blocks_per_slice; blk++)  {  // leftover blocks after the full tiles
+      digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
+    }
+
+  }
+  // block-wide inclusive prefix sum over the per-digit totals (all threads of the block take part in the scan)
+  uint32_t digit_count_cumsum;
+  BlockScan(scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
+  __syncthreads();
+  if (threadIdx.x < RADIX_DIGITS) {
+    digit_cum_sum[tidx] = digit_count_cumsum;
+  }
+}
+
 // Assumption: k can not be larger than UINT32_MAX
 template <typename Bitwise, typename T>
 C10_LAUNCH_BOUNDS_1(RADIX_DIGITS)  // one thread per digit
 __global__ void computeBlockwiseWithinKCounts(
  Bitwise* desires_in,          // size: num_slices
  short* counts,             // size: num_slices * blocks_per_slice * radix_digits
+  uint32_t* digit_cum_sum,
  uint32_t* ks_to_find_in,  // size: num_slices
  uint32_t blocks_per_slice,
  int current_bit,
@ -338,7 +376,7 @@ __global__ void computeBlockwiseWithinKCounts(
  Bitwise* desires_out,
  uint32_t num_blocks
 ) {
-  // This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel.
+  // This kernel should be launched with the same number of blocks as the `computeBlockDigitCounts` kernel.
  int tidx = threadIdx.x;
  uint32_t block_idx = getLinearBlockId<uint32_t>();
  uint32_t slice_idx = block_idx / blocks_per_slice;
@ -351,36 +389,15 @@ __global__ void computeBlockwiseWithinKCounts(
  if (block_idx >= num_blocks) {
    return;
  }
-  typedef cub::BlockScan<uint32_t, BLOCK_THREADS> BlockScan;
-  union __align__(16) TempStorage {
-    uint32_t digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice
-    typename BlockScan::TempStorage scan_storage;
-  };
-  __shared__ TempStorage temp_storage;

-  // accumulates counters from multiple blocks
-  uint32_t digit_count = 0;
-  if (tidx < RADIX_DIGITS) {
-    for (int blk = 0; blk < blocks_per_slice; ++blk) {
-      digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + tidx];
-    }
-  }
-
-  // compute the block-wide inclusive prefix sum
-  uint32_t digit_count_cumsum;
-  BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
-  __syncthreads();
-  // every thread also need the perfix_sum of it's left value for comparison, so save a copy in shared mem
-  if (tidx < RADIX_DIGITS) {
-    temp_storage.digit_count_cumsum[tidx] = digit_count_cumsum;
-  }
-  __syncthreads();

  __shared__ Bitwise desired;
  uint32_t k_to_find = ks_to_find_in[slice_idx];

  if (tidx < RADIX_DIGITS) {
-    uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1];
+    uint32_t position = slice_idx * RADIX_DIGITS + tidx;
+    uint32_t digit_count_cumsum = digit_cum_sum[position];
+    uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : digit_cum_sum[position - 1];

    // if not the last pass: update desired and ks_to_find
    // if last pass: write out the kth value
@ -466,7 +483,7 @@ template <typename Bitwise>
 __global__ void computeBlockwiseKthCounts(
  Bitwise* desires,            // size: num_slices
  short* counts,               // size: num_slices * blocks_per_slice * radix_digits
-  uint32_t num_blocks,         // the number of blocks used by `radixFindKthValues` kernel
+  uint32_t num_blocks,         // the number of blocks used by `computeBlockDigitCounts` kernel
  uint32_t blocks_per_slice,
  // outputs:
  uint32_t* kthCounts          // size: num_slices * blocks_per_slice == num_blocks
@ -649,9 +666,7 @@ void launch(
  T* kthValues = reinterpret_cast<T*>(kthValues_buffer.get());

  TORCH_CHECK(blocks_per_slice <= std::numeric_limits<uint32_t>::max(), "blocks_per_slice larger than uint32 maximum is not supported");
-  auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t));
-  uint32_t* semaphores = reinterpret_cast<uint32_t*>(semaphores_buffer.get());
-  AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream));
+

  auto ks_to_find_buffer = allocator.allocate(2 * numInputSlices * sizeof(uint32_t));
  uint32_t* ks_to_find = reinterpret_cast<uint32_t*>(ks_to_find_buffer.get());
@ -668,6 +683,10 @@ void launch(
  static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits<short>::max(),
    "blockwise counter too large");

+  auto digit_cum_sum_buffer = allocator.allocate(numInputSlices * RADIX_DIGITS * sizeof(uint32_t));
+  uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
+  AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
+
 #if CUB_SUPPORTS_SCAN_BY_KEY()
  auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
  uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
@ -691,7 +710,7 @@ void launch(

  // iterate radix bits for multiple passes
  for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) {
-    radixFindKthValues<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
+    computeBlockDigitCounts<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
        input,
        inputSliceSize,
        ks_to_find_in, // unused arg
@ -704,10 +723,14 @@ void launch(
        desired_in,
        counts);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+    computeDigitCumSum<<<numInputSlices, RADIX_DIGITS, 0, stream>>>(counts, digit_cum_sum, blocks_per_slice);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+
    // we unconditionally call this kernel to update desired/ks_to_find/kthValues
    // if cub supports scan_by_key we additionally do k counts
    computeBlockwiseWithinKCounts<Bitwise, T><<<grid, RADIX_DIGITS, 0, stream>>>(
-      desired_in, counts, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
+      desired_in, counts, digit_cum_sum, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    // swap desired/ks_to_find in and out for next iter
    auto tmp_desired = desired_in;
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
@ -1107,10 +1107,14 @@ void ldl_factor_kernel(
  auto preferred_backend = at::globalContext().linalgPreferredBackend();
  switch (preferred_backend) {
    case at::LinalgBackend::Cusolver:
-      return ldl_factor_cusolver(
+       { ldl_factor_cusolver(
          LD, pivots, info, upper, hermitian);
+        return;
+}
    case at::LinalgBackend::Magma:
-      return ldl_factor_magma(LD, pivots, info, upper, hermitian);
+       { ldl_factor_magma(LD, pivots, info, upper, hermitian);
+        return;
+}
    default:
    // By default use cusolver if available and magma otherwise.
    // If cusolver and magma 2.5.4+ are both available and hermitian=true,
@ -1122,8 +1126,10 @@ void ldl_factor_kernel(
            LD, pivots, info, upper, hermitian);
      }
 #endif
-      return ldl_factor_cusolver(
-          LD, pivots, info, upper, hermitian);
+    { ldl_factor_cusolver(
+      LD, pivots, info, upper, hermitian);
+      return;
+    }
 #else
      return ldl_factor_magma(LD, pivots, info, upper, hermitian);
 #endif
@ -1839,11 +1845,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
      // For the benchmarks see
      // https://github.com/pytorch/pytorch/pull/56253#discussion_r622851107
      if (input.size(-2) <= 256 && batchCount(input) >= std::max<int64_t>(2, input.size(-2) / 16)) {
-        return geqrf_batched_cublas(input, tau);
+        geqrf_batched_cublas(input, tau);
+        return;
      } else {
-        return geqrf_cusolver(input, tau);
+        geqrf_cusolver(input, tau);
+        return;
      }
-      return geqrf_batched_cublas(input, tau);
+      geqrf_batched_cublas(input, tau);
+      return;
  };

  auto preferred_backend = at::globalContext().linalgPreferredBackend();
@ -1856,10 +1865,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
  // - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly.
  // - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu
    case at::LinalgBackend::Magma:
-      return geqrf_magma(input, tau);
+      { geqrf_magma(input, tau);
+        return;
+      }
    case at::LinalgBackend::Cusolver:
    default:
-      return geqrf_cusolver_backend(input, tau);
+      { geqrf_cusolver_backend(input, tau);
+        return;
+      }
  }
 #else
  return geqrf_magma(input, tau);
@ -2703,13 +2716,17 @@ void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
  auto preferred_backend = at::globalContext().linalgPreferredBackend();
  switch (preferred_backend) {
    case at::LinalgBackend::Magma:
-      return gels_magma(a, b, infos);
+      { gels_magma(a, b, infos);
+        return;
+      }
    case at::LinalgBackend::Cusolver:
    default:
      // linalg_lstsq_gels is a generic function that is implemented using
      // geqrf_stub, ormqr_stub, and triangular_solve_stub
      // It dispatches to cuSOLVER for CUDA inputs if USE_LINALG_SOLVER is defined
-      return linalg_lstsq_gels(a, b, infos);
+      { linalg_lstsq_gels(a, b, infos);
+        return;
+      }
  }
 #else
  return gels_magma(a, b, infos);
--- a/aten/src/ATen/native/cudnn/Conv_v8.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp
@ -337,8 +337,7 @@ struct BenchmarkCache {
            engine_cache_order.begin(), engine_cache_order, it->second.second);
      }
    } else {
-      engine_cache.erase(key);
-      engine_cache.emplace(
+      engine_cache.insert_or_assign(
          key,
          std::make_pair(results, engine_cache_order.end())); // dummy iterator
    }
--- a/aten/src/ATen/native/cudnn/MHA.cpp
+++ b/aten/src/ATen/native/cudnn/MHA.cpp
@ -371,8 +371,7 @@ struct MHAGraphCache {
  }

  void update(const KeyType& key, T& results) {
-    engine_cache.erase(key);
-    engine_cache.emplace(key, std::move(results));
+    engine_cache.insert_or_assign(key, std::move(results));
  }
 };

--- a/aten/src/ATen/native/cudnn/RNN.cpp
+++ b/aten/src/ATen/native/cudnn/RNN.cpp
@ -1222,7 +1222,7 @@ cudnnRNNAlgo_t get_algo(
 }

 cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
-  if (dtype == CUDNN_DATA_HALF) {
+  if (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) {
    return CUDNN_DATA_FLOAT;
  }
  return dtype;
--- a/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip
+++ b/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip
@ -772,13 +772,21 @@ void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {

 template <>
 void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  std::string_view arch(dprops->gcnArchName);
-  if (arch == "gfx1100") {
+  static const std::vector<std::string> wmma_archs = {
+    "gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
+#if ROCM_VERSION >= 70000
+    "gfx1150", "gfx1151"
+#endif
+  };
+  if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
    dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
-  } else{
+  }
+  else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
    dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
  }
+  else {
+    TORCH_CHECK(false, "gemm_internal_ck<at::BFloat16> unsupported gfx arch");
+  }
 }

 } // namespace at::native
--- a/aten/src/ATen/native/hip/ck_gemm_half.hip
+++ b/aten/src/ATen/native/hip/ck_gemm_half.hip
@ -599,11 +599,21 @@ void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {

 template <>
 void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
-  if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) {
+  static const std::vector<std::string> wmma_archs = {
+    "gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
+#if ROCM_VERSION >= 70000
+    "gfx1150", "gfx1151"
+#endif
+  };
+  if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
    dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
-  } else{
+  }
+  else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
    dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
  }
+  else {
+    TORCH_CHECK(false, "gemm_internal_ck<at::Half> unsupported gfx arch");
+  }
 }

 } // namespace at::native
--- a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp
@ -373,59 +373,67 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kStrided) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        return addmm_dense_result(
+        addmm_dense_result(
            mat2.transpose(0, 1).to_sparse_csr(),
            mat1.transpose(0, 1),
            beta,
            alpha,
            result.transpose(0, 1));
+            return;
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
-        return addmm_dense_result(
+        addmm_dense_result(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
            result.transpose(-2, -1));
+            return;
      }
    }
    if (mat2.layout() == kSparseBsc) {
      if (result.layout() == kStrided) {
-        return addmm_dense_result(
+        addmm_dense_result(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
            result.transpose(-2, -1));
+            return;
      }
    }
  }
  if (mat1.layout() == kSparseCsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
-        return addmm_dense_result(mat1, mat2, beta, alpha, result);
+        addmm_dense_result(mat1, mat2, beta, alpha, result);
+        return;
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kStrided) {
-        return addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
+        addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
+        return;
      }
      if (result.layout() == kSparseCsr) {
-        return addmm_sparse_result(mat1, mat2, beta, alpha, result);
+        addmm_sparse_result(mat1, mat2, beta, alpha, result);
+        return;
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
        // TODO: CSR @ CSC kernel would be very fast due to format alignment
-        return addmm_sparse_input_dense_result(
-            mat1, mat2.to_sparse_csr(), beta, alpha, result);
+        addmm_sparse_input_dense_result(
+          mat1, mat2.to_sparse_csr(), beta, alpha, result);
+        return;
      }
      if (result.layout() == kSparseCsr) {
        // TODO: CSR @ CSC kernel would be very fast due to format alignment
-        return addmm_sparse_result(
-            mat1, mat2.to_sparse_csr(), beta, alpha, result);
+        addmm_sparse_result(
+          mat1, mat2.to_sparse_csr(), beta, alpha, result);
+        return;
      }
    }
  }
@ -433,56 +441,62 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
        // TODO: avoid csc->csr conversion with native csc support
-        return addmm_dense_result(
-            mat1.to_sparse_csr(), mat2, beta, alpha, result);
+        addmm_dense_result(
+          mat1.to_sparse_csr(), mat2, beta, alpha, result);
+        return;
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kSparseCsr) {
        // TODO: avoid csc->csr conversion with native csc support
-        return addmm_sparse_result(
-            mat1.to_sparse_csr(), mat2, beta, alpha, result);
+        addmm_sparse_result(
+          mat1.to_sparse_csr(), mat2, beta, alpha, result);
+        return;
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
-        return addmm_sparse_input_dense_result(
-            mat2.transpose(-2, -1),
-            mat1.transpose(-2, -1),
-            beta,
-            alpha,
-            result.transpose(-2, -1));
+        addmm_sparse_input_dense_result(
+          mat2.transpose(-2, -1),
+          mat1.transpose(-2, -1),
+          beta,
+          alpha,
+          result.transpose(-2, -1));
+        return;
      }
      if (result.layout() == kSparseCsr) {
        // TODO avoid csc->csr
-        return addmm_sparse_result(
-            mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
+        addmm_sparse_result(
+          mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
+        return;
      }
      if (result.layout() == kSparseCsc) {
-        return addmm_sparse_result(
-            mat2.transpose(-2, -1),
-            mat1.transpose(-2, -1),
-            beta,
-            alpha,
-            result.transpose(-2, -1));
+        addmm_sparse_result(
+          mat2.transpose(-2, -1),
+          mat1.transpose(-2, -1),
+          beta,
+          alpha,
+          result.transpose(-2, -1));
+        return;
      }
    }
  }
  if (mat1.layout() == kSparseBsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
-        return addmm_dense_result(mat1, mat2, beta, alpha, result);
+        addmm_dense_result(mat1, mat2, beta, alpha, result);
+        return;
      }
    }
  }
  TORCH_CHECK(
-      false,
-      "addmm: computation on CPU is not implemented for ",
-      result.layout(),
-      " + ",
-      mat1.layout(),
-      " @ ",
-      mat2.layout());
+    false,
+    "addmm: computation on CPU is not implemented for ",
+    result.layout(),
+    " + ",
+    mat1.layout(),
+    " @ ",
+    mat2.layout());
 }

 /*
@ -496,16 +510,16 @@ void addmm_out_sparse_csr(
               [out] result of the operation.
 */
 void addmv_out_sparse_csr(
-    const Tensor& mat,
-    const Tensor& vec,
-    const Scalar& beta,
-    const Scalar& alpha,
-    const Tensor& result) {
+  const Tensor& mat,
+  const Tensor& vec,
+  const Scalar& beta,
+  const Scalar& alpha,
+  const Tensor& result) {
 #if !AT_USE_MKL_SPARSE()
  TORCH_CHECK(
-      false,
-      "Calling addmv on a sparse CPU tensor requires Linux platform. ",
-      "Please use PyTorch built with MKL on Linux.");
+    false,
+    "Calling addmv on a sparse CPU tensor requires Linux platform. ",
+    "Please use PyTorch built with MKL on Linux.");
 #else
  c10::MaybeOwned<Tensor> result_ = prepare_dense_vector_for_mkl(result);
  c10::MaybeOwned<Tensor> vec_ = prepare_dense_vector_for_mkl(vec);
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -5,38 +5,6 @@
 # representing ScalarType's. They are now superseded by usage of
 # `aten::to()`. The ops remain here for backward compatibility purposes.

-# DEPRECATED. DO NOT USE
- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
-# DEPRECATED. DO NOT USE
- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
-# DEPRECATED. DO NOT USE
- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
-# DEPRECATED. DO NOT USE
- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
-# DEPRECATED. DO NOT USE
- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
-# DEPRECATED. DO NOT USE
- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
-# DEPRECATED. DO NOT USE
- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
-# DEPRECATED. DO NOT USE
- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
-  variants: function
-
 # Computes the gradient of current tensor w.r.t. graph leaves.
 - func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
  manual_cpp_binding: True
@ -6725,12 +6693,12 @@

 - func: native_norm(Tensor self, Scalar p=2) -> Tensor
  dispatch:
-    SparseCPU, SparseCUDA: norm_sparse
+    SparseCPU, SparseCUDA, SparseMPS: norm_sparse
  autogen: native_norm.out

 - func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor
  dispatch:
-    SparseCPU, SparseCUDA: norm_sparse
+    SparseCPU, SparseCUDA, SparseMPS: norm_sparse
  autogen: native_norm.ScalarOpt_dim_dtype_out

 - func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
@ -6856,14 +6824,14 @@
  device_check: NoCheck   # TensorIterator
  variants: function, method
  dispatch:
-    SparseCPU, SparseCUDA: sparse_dtype_norm
+    SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm

 - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
  structured_delegate: norm.out
  device_check: NoCheck   # TensorIterator
  variants: function, method
  dispatch:
-    SparseCPU, SparseCUDA: sparse_norm
+    SparseCPU, SparseCUDA, SparseMPS: sparse_norm

 - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
  structured: True
--- a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
@ -810,7 +810,8 @@ void addmm_out_sparse_csr(
  if (mat1.layout() == kSparseBsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided)
-        return block_sparse_mm(input, mat1, mat2, beta, alpha, result);
+         { block_sparse_mm(input, mat1, mat2, beta, alpha, result); return;
+}
    }
  }

@ -819,13 +820,13 @@ void addmm_out_sparse_csr(
      if (result.layout() == kStrided) {
        auto result_t = result.transpose(-2, -1);
        auto input_t = (result.is_same(input) ? result_t : input.transpose(-2, -1));
-        return block_sparse_mm(
+        block_sparse_mm(
            input_t,
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
-            result_t);
+            result_t); return;
      }
    }
  }
@ -840,41 +841,41 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kStrided) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        return spmm(
+        spmm(
            mat2.transpose(0, 1).to_sparse_csr(),
            mat1.transpose(0, 1),
            beta,
            alpha,
-            result.transpose(0, 1));
+            result.transpose(0, 1)); return;
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
-        return spmm(
+        spmm(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
-            result.transpose(-2, -1));
+            result.transpose(-2, -1)); return;
      }
    }
  }
  if (mat1.layout() == kSparseCsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
-        return spmm(mat1, mat2, beta, alpha, result);
+        spmm(mat1, mat2, beta, alpha, result); return;
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kSparseCsr) {
-        return spgemm(mat1, mat2, beta, alpha, result);
+        spgemm(mat1, mat2, beta, alpha, result); return;
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kSparseCsr) {
        // TODO: Add native CSC support via cuSPARSE if supported.
        // CSR @ CSC kernel would be very fast due to format alignment
-        return spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result);
+        spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result); return;
      }
    }
  }
@ -882,27 +883,28 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        return spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
+        spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kSparseCsr)
        // TODO: Add native CSC support via cuSPARSE if supported.
-        return spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
+         { spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
+}
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kSparseCsr) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        return spgemm(
-            mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
+        spgemm(
+            mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result); return;
      }
      if (result.layout() == kSparseCsc) {
-        return spgemm(
+        spgemm(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
-            result.transpose(-2, -1));
+            result.transpose(-2, -1)); return;
      }
    }
  }
@ -933,7 +935,7 @@ void addmv_out_sparse_csr(
    const Scalar& alpha,
    const Tensor& result) {
  if (mat.layout() == kSparseBsr) {
-    return block_sparse_mv(mat, vec, beta, alpha, result);
+    block_sparse_mv(mat, vec, beta, alpha, result); return;
  }
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;

@ -1213,9 +1215,9 @@ void triangular_solve_out_sparse_csr(
  }
  if (A.layout() == kSparseBsr) {
    if (B.size(-1) == 1) {
-      return block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular);
+      block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular); return;
    } else {
-      return block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular);
+      block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular); return;
    }
  }
 #ifdef USE_ROCM
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py
@ -117,7 +117,7 @@ class FwdKernel:
    def get_all(cls) -> list["FwdKernel"]:
        kernels: list[FwdKernel] = []
        for aligned, dtype, (sm, sm_max) in itertools.product(
-            [True, False], DTYPES.keys(), zip(SM, SM[1:])
+            [True, False], DTYPES.keys(), itertools.pairwise(SM)
        ):
            # Remove some kernels we don't use
            if dtype == "bf16" and sm < 80:
@ -228,7 +228,7 @@ class BwdKernel:
        for aligned, dtype, (sm, sm_max), apply_dropout, max_k in itertools.product(
            [True, False],
            DTYPES.keys(),
-            zip(SM, SM[1:]),
+            itertools.pairwise(SM),
            [True, False],
            [32, 64, 128, 2**16],
        ):
--- a/benchmarks/distributed/bench_nvshmem_tile_reduce.py
+++ b/benchmarks/distributed/bench_nvshmem_tile_reduce.py
@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""
+Benchmark for NVSHMEM tile reduce operations.
+
+Usage:
+python benchmarks/distributed/bench_nvshmem_tile_reduce.py
+
+This benchmark measures the performance of tile reduce operations across different
+tile sizes taken from a single fixed-size matrix.
+"""
+
+import time
+
+import torch
+import torch.distributed as dist
+import torch.distributed._symmetric_memory as symm_mem
+from torch.testing._internal.common_distributed import MultiProcContinuousTest
+from torch.testing._internal.common_utils import (
+    requires_cuda_p2p_access,
+    skip_but_pass_in_sandcastle_if,
+    skipIfRocm,
+)
+
+
+# Decorator factory: skip the decorated benchmark when NVSHMEM is not available
+def requires_nvshmem():
+    unavailable = not symm_mem.is_nvshmem_available()
+    return skip_but_pass_in_sandcastle_if(
+        unavailable, "bench_nvshmem_tile_reduce requires NVSHMEM, skipping benchmark"
+    )
+
+
+# So that benchmarks are written in device-agnostic way
+device_type = "cuda"
+device_module = torch.get_device_module(device_type)
+
+
+@requires_nvshmem()
+@requires_cuda_p2p_access()
+class NVSHMEMTileReduceBenchmark(MultiProcContinuousTest):
+    """Times torch.ops.symm_mem.tile_reduce on square tiles of a larger matrix."""
+
+    def _init_device(self) -> None:
+        """Select this rank's device and make NVSHMEM the SymmMem backend."""
+        # TODO: relieve this (seems to hang if without)
+        device_module.set_device(self.device)
+        symm_mem.set_backend("NVSHMEM")
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device(device_type, self.rank)
+
+    def _benchmark_tile_reduce_single(
+        self,
+        full_size: int,
+        tile_size: int,
+        warmup_iters: int = 5,
+        bench_iters: int = 10,
+    ) -> dict:
+        """
+        Benchmark a single configuration of tile reduce.
+
+        Args:
+            full_size: Size of the full matrix (full_size x full_size)
+            tile_size: Size of the top-left tile to reduce (tile_size x tile_size)
+            warmup_iters: Number of warmup iterations
+            bench_iters: Number of benchmark iterations
+
+        Returns:
+            Dictionary with benchmark results
+        """
+        self._init_device()
+        group_name = dist.group.WORLD.group_name
+        symm_mem.enable_symm_mem_for_group(group_name)
+
+        dtype = torch.float
+
+        # Allocate full matrices
+        full_inp = symm_mem.empty(
+            full_size, full_size, dtype=dtype, device=self.device
+        ).fill_(self.rank)
+        full_out = symm_mem.empty(
+            full_size, full_size, dtype=dtype, device=self.device
+        ).fill_(0)
+
+        slice_ut = slice(0, tile_size)
+        inp_tile = full_inp[slice_ut, slice_ut]
+        out_tile = full_out[slice_ut, slice_ut]
+
+        root = 0
+
+        # Warmup iterations
+        for _ in range(warmup_iters):
+            torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
+            device_module.synchronize(self.device)
+
+        # Benchmark iterations
+        times = []
+
+        dist.barrier()
+        device_module.synchronize(self.device)
+        start_time = time.perf_counter()
+
+        for _ in range(bench_iters):
+            torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
+
+        device_module.synchronize(self.device)
+        end_time = time.perf_counter()
+        times.append((end_time - start_time) / bench_iters)
+
+        # Calculate statistics
+        times = torch.tensor(times, dtype=torch.float64)
+        # std() of the single aggregate sample would be NaN; report 0.0 instead
+        std_time = times.std().item() if times.numel() > 1 else 0.0
+        tile_elements = tile_size * tile_size
+        tile_bytes = tile_elements * dtype.itemsize
+
+        results = {
+            "full_size": full_size,
+            "tile_size": tile_size,
+            "tile_elements": tile_elements,
+            "tile_bytes": tile_bytes,
+            "world_size": self.world_size,
+            "mean_time_ms": times.mean().item() * 1000,
+            "std_time_ms": std_time * 1000,
+            "min_time_ms": times.min().item() * 1000,
+            "max_time_ms": times.max().item() * 1000,
+            "throughput_gb_s": tile_bytes / (times.mean().item() * 1e9),
+            "elements_per_sec": tile_elements / times.mean().item(),
+        }
+
+        return results
+
+    @skipIfRocm
+    def test_benchmark_tile_reduce_various_sizes(self) -> None:
+        """Benchmark tile reduce across various tile sizes of one full-size matrix."""
+        # Tiles of increasing size are all cut from a matrix as big as the largest tile
+        tile_sizes = [512, 1024, 2048, 4096, 8192, 16384]
+        full_size = tile_sizes[-1]
+        warmup_iters = 5
+        bench_iters = 20
+
+        results = []
+
+        for tile_size in tile_sizes:
+            try:
+                result = self._benchmark_tile_reduce_single(
+                    full_size, tile_size, warmup_iters, bench_iters
+                )
+                results.append(result)
+
+                if self.rank == 0:
+                    print(
+                        f"Matrix Size: {full_size}x{full_size}, Tile Size: {tile_size}x{tile_size}"
+                    )
+                    print(
+                        f"  Mean Time: {result['mean_time_ms']:.3f} ± {result['std_time_ms']:.3f} ms"
+                    )
+                    print(f"  Throughput: {result['throughput_gb_s']:.2f} GB/s")
+                    print(f"  Bytes: {result['tile_bytes']:.0f}")
+                    print()
+
+            except Exception as e:
+                if self.rank == 0:
+                    print(f"Failed to benchmark tile size {tile_size}: {e}")
+
+        # Print summary
+        if self.rank == 0 and results:
+            print("=== BENCHMARK SUMMARY ===")
+            print(
+                f"{'Matrix Size':<12} {'Tile Size':<12} {'Time (ms)':<12} {'Throughput (GB/s)':<18} {'Bytes':<15}"
+            )
+            print("-" * 73)
+
+            for result in results:
+                matrix = f"{result['full_size']}x{result['full_size']}"
+                tile = f"{result['tile_size']}x{result['tile_size']}"
+                print(
+                    f"{matrix:<12} {tile:<12} "
+                    f"{result['mean_time_ms']:<12.3f} "
+                    f"{result['throughput_gb_s']:<18.2f} "
+                    f"{result['tile_bytes']:<15.0f}"
+                )
+
+
+if __name__ == "__main__":
+    # For standalone usage, you'd need to set up distributed environment
+    # For now, this is meant to be run via the PyTorch test framework
+    from torch.testing._internal.common_utils import run_tests
+
+    run_tests()
--- a/benchmarks/dynamo/all_torchbench_models_list.txt
+++ b/benchmarks/dynamo/all_torchbench_models_list.txt
@ -25,15 +25,6 @@ drq
 fambench_dlrm
 fambench_xlmr
 fastNLP_Bert
-hf_Albert
-hf_Bart
-hf_Bert
-hf_BigBird
-hf_DistilBert
-hf_GPT2
-hf_Longformer
-hf_Reformer
-hf_T5
 maml
 maml_omniglot
 mnasnet1_0
@ -60,13 +51,6 @@ soft_actor_critic
 speech_transformer
 squeezenet1_1
 tacotron2
-timm_efficientdet
-timm_efficientnet
-timm_nfnet
-timm_regnet
-timm_resnest
-timm_vision_transformer
-timm_vovnet
 tts_angular
 vgg16
 vision_maskrcnn
--- a/benchmarks/dynamo/cachebench.py
+++ b/benchmarks/dynamo/cachebench.py
@ -23,7 +23,6 @@ TORCHBENCH_MODELS: list[str] = [
    "resnet50",
    "moco",
    "llama",
-    "hf_T5",
 ]
 HUGGINGFACE_MODELS: list[str] = [
    "AllenaiLongformerBase",
--- a/benchmarks/dynamo/check_accuracy.py
+++ b/benchmarks/dynamo/check_accuracy.py
@ -10,9 +10,7 @@ import pandas as pd

 flaky_models = {
    "yolov3",
-    "gluon_inception_v3",
    "detectron2_maskrcnn_r_101_c4",
-    "timm_efficientnet",  # see https://github.com/pytorch/pytorch/issues/148699
    "XGLMForCausalLM",  # discovered in https://github.com/pytorch/pytorch/pull/128148
    "moondream",  # discovered in https://github.com/pytorch/pytorch/pull/159291
    # discovered in https://github.com/pytorch/pytorch/issues/161419. Its not flaky but really hard to repro, so skipping it
@ -36,8 +34,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
            {
                "Background_Matting",
                "alexnet",
-                "cait_m36_384",
-                "dla102",
                "demucs",
                "densenet121",
                "detectron2_fcos_r_50_fpn",
@ -45,13 +41,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "doctr_reco_predictor",
                "dpn107",
                "fbnetv3_b",
-                "hf_BigBird",
-                "hf_Longformer",
-                "hf_Reformer",
-                "hf_Roberta_base",
-                "hf_T5",
-                "hf_T5_base",
-                "hf_T5_generate",
                "levit_128",
                "llava",
                "microbench_unbacked_tolist_sum",
@ -70,12 +59,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "stable_diffusion_text_encoder",
                "stable_diffusion_unet",
                "swsl_resnext101_32x16d",
-                "timm_efficientdet",
-                "timm_efficientnet",
-                "timm_nfnet",
-                "timm_regnet",
-                "timm_resnest",
-                "timm_vovnet",
                "torchrec_dlrm",
                "vgg16",
                # LLM
--- a/benchmarks/dynamo/check_graph_breaks.py
+++ b/benchmarks/dynamo/check_graph_breaks.py
@ -10,7 +10,6 @@ import pandas as pd

 flaky_models = {
    "yolov3",
-    "gluon_inception_v3",
    "detectron2_maskrcnn_r_101_c4",
    "XGLMForCausalLM",  # discovered in https://github.com/pytorch/pytorch/pull/128148
    "detectron2_fcos_r_50_fpn",
@ -32,18 +31,11 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
        flaky_models.update(
            {
                "alexnet",
-                "cait_m36_384",
                "demucs",
                "densenet121",
                "detectron2_fcos_r_50_fpn",
                "doctr_det_predictor",
                "doctr_reco_predictor",
-                "hf_BigBird",
-                "hf_Longformer",
-                "hf_Reformer",
-                "hf_Roberta_base",
-                "hf_T5",
-                "hf_T5_base",
                "levit_128",
                "llava",
                "microbench_unbacked_tolist_sum",
@ -54,7 +46,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
                "stable_diffusion_text_encoder",
                "stable_diffusion_unet",
                "timm_efficientdet",
-                "timm_nfnet",
                "torchrec_dlrm",
                "vgg16",
                # LLM
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0



-AlbertForQuestionAnswering,pass,0
-
-
-
 AllenaiLongformerBase,pass,4


@ -18,50 +14,22 @@ BartForCausalLM,pass,0



-BartForConditionalGeneration,pass,0
-
-
-
 BertForMaskedLM,pass,0



-BertForQuestionAnswering,pass,0
-
-
-
 BlenderbotForCausalLM,pass_due_to_skip,0



-BlenderbotSmallForCausalLM,pass,0
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,0
-
-
-
-CamemBert,pass,0
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,pass,0
-
-
-
 DistilBertForMaskedLM,pass,0



-DistilBertForQuestionAnswering,pass,0
-
-
-
 DistillGPT2,pass,2


@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0



-ElectraForQuestionAnswering,pass,0
-
-
-
 GPT2ForSequenceClassification,pass,0


@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0



-LayoutLMForSequenceClassification,pass,0
-
-
-
 M2M100ForConditionalGeneration,pass,0


@ -98,10 +58,6 @@ MBartForCausalLM,pass,0



-MBartForConditionalGeneration,pass,0
-
-
-
 MT5ForConditionalGeneration,pass,0


@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0



-MegatronBertForQuestionAnswering,pass,0
-
-
-
 MobileBertForMaskedLM,pass,0



-MobileBertForQuestionAnswering,pass,0
-
-
-
 OPTForCausalLM,pass,0


@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0



-PLBartForConditionalGeneration,pass,0
-
-
-
 PegasusForCausalLM,pass,0



-PegasusForConditionalGeneration,pass,0
-
-
-
 RobertaForCausalLM,pass,0



-RobertaForQuestionAnswering,pass,0
-
-
-
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4



-AlbertForQuestionAnswering,pass,5
-
-
-
 AllenaiLongformerBase,pass,9


@ -18,50 +14,22 @@ BartForCausalLM,pass,6



-BartForConditionalGeneration,pass,8
-
-
-
 BertForMaskedLM,pass,5



-BertForQuestionAnswering,pass,5
-
-
-
 BlenderbotForCausalLM,eager_fail_to_run,0



-BlenderbotSmallForCausalLM,pass,6
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,8
-
-
-
-CamemBert,pass,5
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
-
-
-
 DistilBertForMaskedLM,pass,5



-DistilBertForQuestionAnswering,pass,5
-
-
-
 DistillGPT2,pass,7


@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4



-ElectraForQuestionAnswering,pass,5
-
-
-
 GPT2ForSequenceClassification,pass,6


@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5



-LayoutLMForSequenceClassification,pass,6
-
-
-
 M2M100ForConditionalGeneration,pass,4


@ -98,10 +58,6 @@ MBartForCausalLM,pass,6



-MBartForConditionalGeneration,pass,8
-
-
-
 MT5ForConditionalGeneration,pass,5


@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5



-MegatronBertForQuestionAnswering,pass,5
-
-
-
 MobileBertForMaskedLM,pass,3



-MobileBertForQuestionAnswering,pass,3
-
-
-
 OPTForCausalLM,pass,8


@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6



-PLBartForConditionalGeneration,pass,8
-
-
-
 PegasusForCausalLM,pass,6



-PegasusForConditionalGeneration,pass,7
-
-
-
 RobertaForCausalLM,pass,5



-RobertaForQuestionAnswering,pass,5
-
-
-
 T5ForConditionalGeneration,pass,5


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_inference.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0



-botnet26t_256,pass,0
-
-
-
-cait_m36_384,pass,0
-
-
-
-coat_lite_mini,pass,0
-
-
-
-convit_base,pass,0
-
-
-
-convmixer_768_32,pass,0
-
-
-
-convnext_base,pass,0
-
-
-
-crossvit_9_240,pass,0
-
-
-
-cspdarknet53,pass,0
-
-
-
 deit_base_distilled_patch16_224,pass,0



-dla102,pass,0
-
-
-
 dm_nfnet_f0,pass,0



-dpn107,pass,0
-
-
-
-eca_botnext26ts_256,pass,0
-
-
-
-eca_halonext26ts,pass,0
-
-
-
-ese_vovnet19b_dw,pass,0
-
-
-
-fbnetc_100,pass,0
-
-
-
-fbnetv3_b,pass,0
-
-
-
-gernet_l,pass,0
-
-
-
 ghostnet_100,pass,0



-gluon_inception_v3,pass,0
-
-
-
-gmixer_24_224,pass,0
-
-
-
-gmlp_s16_224,pass,0
-
-
-
-hrnet_w18,pass,0
-
-
-
 inception_v3,pass,0



-jx_nest_base,pass,0
-
-
-
-lcnet_050,pass,0
-
-
-
-levit_128,pass,0
-
-
-
-mixer_b16_224,pass,0
-
-
-
-mixnet_l,pass,0
-
-
-
-mnasnet_100,pass,0
-
-
-
 mobilenetv2_100,pass,0


@ -146,100 +42,16 @@ nfnet_l0,pass,0



-pit_b_224,pass,0
-
-
-
-pnasnet5large,pass,0
-
-
-
-poolformer_m36,pass,0
-
-
-
-regnety_002,pass,0
-
-
-
 repvgg_a2,pass,0



-res2net101_26w_4s,pass,0
-
-
-
-res2net50_14w_8s,pass,0
-
-
-
-res2next50,pass,0
-
-
-
-resmlp_12_224,pass,0
-
-
-
-resnest101e,pass,0
-
-
-
-rexnet_100,pass,0
-
-
-
-sebotnet33ts_256,pass,0
-
-
-
-selecsls42b,pass,0
-
-
-
-spnasnet_100,pass,0
-
-
-
 swin_base_patch4_window7_224,pass,0



-swsl_resnext101_32x16d,pass,0
-
-
-
 tf_efficientnet_b0,pass,0



-tf_mixnet_l,pass,0
-
-
-
-tinynet_a,pass,0
-
-
-
-tnt_s_patch16_224,pass,0
-
-
-
-twins_pcpvt_base,pass,0
-
-
-
 visformer_small,pass,0
-
-
-
-vit_base_patch16_224,pass,0
-
-
-
-volo_d1_224,pass,0
-
-
-
-xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,7



-botnet26t_256,pass,6
-
-
-
-cait_m36_384,eager_fail_to_run,0
-
-
-
-coat_lite_mini,pass,6
-
-
-
-convit_base,pass,7
-
-
-
-convmixer_768_32,pass,5
-
-
-
-convnext_base,pass,7
-
-
-
-crossvit_9_240,pass,7
-
-
-
-cspdarknet53,pass,7
-
-
-
 deit_base_distilled_patch16_224,pass,7



-dla102,pass,7
-
-
-
 dm_nfnet_f0,pass,6



-dpn107,pass,6
-
-
-
-eca_botnext26ts_256,pass,7
-
-
-
-eca_halonext26ts,pass,7
-
-
-
-ese_vovnet19b_dw,pass,7
-
-
-
-fbnetc_100,pass,7
-
-
-
-fbnetv3_b,pass,6
-
-
-
-gernet_l,pass,6
-
-
-
 ghostnet_100,pass,6



-gluon_inception_v3,pass,7
-
-
-
-gmixer_24_224,pass,6
-
-
-
-gmlp_s16_224,pass,7
-
-
-
-hrnet_w18,pass,5
-
-
-
 inception_v3,pass,6



-jx_nest_base,pass,7
-
-
-
-lcnet_050,fail_accuracy,6
-
-
-
-levit_128,pass,7
-
-
-
-mixer_b16_224,pass,7
-
-
-
-mixnet_l,pass,6
-
-
-
-mnasnet_100,pass,7
-
-
-
 mobilenetv2_100,pass,7


@ -146,100 +42,16 @@ nfnet_l0,pass,7



-pit_b_224,pass,6
-
-
-
-pnasnet5large,pass,5
-
-
-
-poolformer_m36,pass,6
-
-
-
-regnety_002,pass,6
-
-
-
 repvgg_a2,pass,7



-res2net101_26w_4s,pass,6
-
-
-
-res2net50_14w_8s,pass,6
-
-
-
-res2next50,pass,6
-
-
-
-resmlp_12_224,pass,6
-
-
-
-resnest101e,pass,6
-
-
-
-rexnet_100,pass,7
-
-
-
-sebotnet33ts_256,pass,6
-
-
-
-selecsls42b,pass,6
-
-
-
-spnasnet_100,pass,7
-
-
-
 swin_base_patch4_window7_224,pass,7



-swsl_resnext101_32x16d,pass,6
-
-
-
 tf_efficientnet_b0,pass,6



-tf_mixnet_l,pass,6
-
-
-
-tinynet_a,pass,6
-
-
-
-tnt_s_patch16_224,pass,7
-
-
-
-twins_pcpvt_base,pass,7
-
-
-
 visformer_small,pass,7
-
-
-
-vit_base_patch16_224,pass,7
-
-
-
-volo_d1_224,pass,7
-
-
-
-xcit_large_24_p8_224,pass_due_to_skip,7
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
@ -130,70 +130,6 @@ functorch_maml_omniglot,pass,0



-hf_Albert,pass,0
-
-
-
-hf_Bart,pass,0
-
-
-
-hf_Bert,pass,0
-
-
-
-hf_Bert_large,pass,0
-
-
-
-hf_BigBird,pass,0
-
-
-
-hf_DistilBert,pass,0
-
-
-
-hf_GPT2,pass,0
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Reformer,pass,5
-
-
-
-hf_Roberta_base,pass,0
-
-
-
-hf_T5,pass,0
-
-
-
-hf_T5_base,eager_fail_to_run,0
-
-
-
-hf_T5_generate,pass,7
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_Whisper,pass,0
-
-
-
-hf_distil_whisper,pass,0
-
-
-
 lennard_jones,pass,0


@ -342,30 +278,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientnet,pass,0
-
-
-
-timm_regnet,pass,0
-
-
-
-timm_resnest,pass,0
-
-
-
-timm_vision_transformer,pass,0
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,0
-
-
-
 torch_multimodal_clip,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
@ -78,62 +78,6 @@ functorch_maml_omniglot,pass,7



-hf_Albert,pass,6
-
-
-
-hf_Bart,pass,6
-
-
-
-hf_Bert,pass,6
-
-
-
-hf_Bert_large,pass,6
-
-
-
-hf_BigBird,pass,6
-
-
-
-hf_DistilBert,pass,6
-
-
-
-hf_GPT2,pass,8
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Reformer,pass,20
-
-
-
-hf_Roberta_base,pass,6
-
-
-
-hf_T5_base,eager_2nd_run_OOM,0
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_Whisper,pass,6
-
-
-
-hf_distil_whisper,model_fail_to_load,0
-
-
-
 lennard_jones,pass,7


@ -250,30 +194,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientnet,fail_accuracy,7
-
-
-
-timm_regnet,pass,7
-
-
-
-timm_resnest,pass,6
-
-
-
-timm_vision_transformer,pass,6
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,6
-
-
-
 torch_multimodal_clip,pass,7


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0



-AlbertForQuestionAnswering,pass,0
-
-
-
 BartForCausalLM,pass,0



-BartForConditionalGeneration,pass,0
-
-
-
 BertForMaskedLM,pass,0



-BertForQuestionAnswering,pass,0
-
-
-
 BlenderbotForCausalLM,pass_due_to_skip,0



-BlenderbotSmallForCausalLM,pass,0
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,0
-
-
-
-CamemBert,pass,0
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,pass,0
-
-
-
 DistilBertForMaskedLM,pass,0



-DistilBertForQuestionAnswering,pass,0
-
-
-
 DistillGPT2,pass,0


@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0



-ElectraForQuestionAnswering,pass,0
-
-
-
 GPT2ForSequenceClassification,pass,0


@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0



-LayoutLMForSequenceClassification,pass,0
-
-
-
 M2M100ForConditionalGeneration,pass,0


@ -94,10 +54,6 @@ MBartForCausalLM,pass,0



-MBartForConditionalGeneration,pass,0
-
-
-
 MT5ForConditionalGeneration,pass,0


@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0



-MegatronBertForQuestionAnswering,pass,0
-
-
-
 MobileBertForMaskedLM,pass,0



-MobileBertForQuestionAnswering,pass,0
-
-
-
 OPTForCausalLM,pass,0


@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0



-PLBartForConditionalGeneration,pass,0
-
-
-
 PegasusForCausalLM,pass,0



-PegasusForConditionalGeneration,pass,0
-
-
-
 RobertaForCausalLM,pass,0



-RobertaForQuestionAnswering,pass,0
-
-
-
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_timm_inference.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0



-botnet26t_256,pass,0
-
-
-
-cait_m36_384,pass,0
-
-
-
-coat_lite_mini,pass,0
-
-
-
-convit_base,pass,0
-
-
-
-convmixer_768_32,pass,0
-
-
-
-convnext_base,pass,0
-
-
-
-crossvit_9_240,pass,0
-
-
-
-cspdarknet53,pass,0
-
-
-
 deit_base_distilled_patch16_224,pass,0



-dla102,pass,0
-
-
-
 dm_nfnet_f0,pass,0



-dpn107,pass,0
-
-
-
-eca_botnext26ts_256,pass,0
-
-
-
-eca_halonext26ts,pass,0
-
-
-
-ese_vovnet19b_dw,pass,0
-
-
-
-fbnetc_100,pass,0
-
-
-
-fbnetv3_b,pass,0
-
-
-
-gernet_l,pass,0
-
-
-
 ghostnet_100,pass,0



-gluon_inception_v3,pass,0
-
-
-
-gmixer_24_224,pass,0
-
-
-
-gmlp_s16_224,pass,0
-
-
-
-hrnet_w18,pass,0
-
-
-
 inception_v3,pass,0



-jx_nest_base,pass,0
-
-
-
-lcnet_050,pass,0
-
-
-
-levit_128,pass,0
-
-
-
-mixer_b16_224,pass,0
-
-
-
-mixnet_l,pass,0
-
-
-
-mnasnet_100,pass,0
-
-
-
 mobilenetv2_100,pass,0


@ -146,100 +42,16 @@ nfnet_l0,pass,0



-pit_b_224,pass,0
-
-
-
-pnasnet5large,pass,0
-
-
-
-poolformer_m36,pass,0
-
-
-
-regnety_002,pass,0
-
-
-
 repvgg_a2,pass,0



-res2net101_26w_4s,pass,0
-
-
-
-res2net50_14w_8s,pass,0
-
-
-
-res2next50,pass,0
-
-
-
-resmlp_12_224,pass,0
-
-
-
-resnest101e,pass,0
-
-
-
-rexnet_100,pass,0
-
-
-
-sebotnet33ts_256,pass,0
-
-
-
-selecsls42b,pass,0
-
-
-
-spnasnet_100,pass,0
-
-
-
 swin_base_patch4_window7_224,pass,0



-swsl_resnext101_32x16d,pass,0
-
-
-
 tf_efficientnet_b0,pass,0



-tf_mixnet_l,pass,0
-
-
-
-tinynet_a,pass,0
-
-
-
-tnt_s_patch16_224,pass,0
-
-
-
-twins_pcpvt_base,pass,0
-
-
-
 visformer_small,pass,0
-
-
-
-vit_base_patch16_224,pass,0
-
-
-
-volo_d1_224,pass,0
-
-
-
-xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
@ -118,62 +118,6 @@ functorch_maml_omniglot,pass,0



-hf_Albert,pass,0
-
-
-
-hf_Bart,pass,0
-
-
-
-hf_Bert,pass,0
-
-
-
-hf_Bert_large,pass,0
-
-
-
-hf_BigBird,fail_accuracy,0
-
-
-
-hf_DistilBert,pass,0
-
-
-
-hf_GPT2,pass,0
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Roberta_base,pass,0
-
-
-
-hf_T5,pass,0
-
-
-
-hf_T5_base,eager_fail_to_run,0
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_Whisper,pass,0
-
-
-
-hf_distil_whisper,pass,0
-
-
-
 lennard_jones,pass,0


@ -314,30 +258,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientnet,pass,0
-
-
-
-timm_regnet,pass,0
-
-
-
-timm_resnest,pass,0
-
-
-
-timm_vision_transformer,pass,0
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,0
-
-
-
 torch_multimodal_clip,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv
@ -114,58 +114,6 @@ functorch_maml_omniglot,pass,0



-hf_Albert,pass,0
-
-
-
-hf_Bart,pass,0
-
-
-
-hf_Bert,pass,0
-
-
-
-hf_Bert_large,pass,0
-
-
-
-hf_BigBird,pass,0
-
-
-
-hf_DistilBert,pass,0
-
-
-
-hf_GPT2,pass,0
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Roberta_base,pass,0
-
-
-
-hf_T5,pass,0
-
-
-
-hf_T5_base,pass,0
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_distil_whisper,pass,0
-
-
-
 lennard_jones,pass,0


@ -278,38 +226,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientdet,model_fail_to_load,0
-
-
-
-timm_efficientnet,pass,0
-
-
-
-timm_nfnet,pass,0
-
-
-
-timm_regnet,pass,0
-
-
-
-timm_resnest,pass,0
-
-
-
-timm_vision_transformer,pass,0
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,0
-
-
-
 torch_multimodal_clip,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv
@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0



-AlbertForQuestionAnswering,pass,0
-
-
-
 BartForCausalLM,pass,0



-BartForConditionalGeneration,pass,0
-
-
-
 BertForMaskedLM,pass,0



-BertForQuestionAnswering,pass,0
-
-
-
 BlenderbotForCausalLM,pass_due_to_skip,0



-BlenderbotSmallForCausalLM,pass,0
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,0
-
-
-
-CamemBert,pass,0
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,pass,0
-
-
-
 DistilBertForMaskedLM,pass,0



-DistilBertForQuestionAnswering,pass,0
-
-
-
 DistillGPT2,pass,0


@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0



-ElectraForQuestionAnswering,pass,0
-
-
-
 GPT2ForSequenceClassification,pass,0


@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0



-LayoutLMForSequenceClassification,pass,0
-
-
-
 M2M100ForConditionalGeneration,pass,0


@ -94,10 +54,6 @@ MBartForCausalLM,pass,0



-MBartForConditionalGeneration,pass,0
-
-
-
 MT5ForConditionalGeneration,pass,0


@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0



-MegatronBertForQuestionAnswering,pass,0
-
-
-
 MobileBertForMaskedLM,pass,0



-MobileBertForQuestionAnswering,pass,0
-
-
-
 OPTForCausalLM,pass,0


@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0



-PLBartForConditionalGeneration,pass,0
-
-
-
 PegasusForCausalLM,pass,0



-PegasusForConditionalGeneration,pass,0
-
-
-
 RobertaForCausalLM,pass,0



-RobertaForQuestionAnswering,pass,0
-
-
-
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_timm_inference.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0



-botnet26t_256,pass,0
-
-
-
-cait_m36_384,pass,0
-
-
-
-coat_lite_mini,pass,0
-
-
-
-convit_base,pass,0
-
-
-
-convmixer_768_32,pass,0
-
-
-
-convnext_base,pass,0
-
-
-
-crossvit_9_240,pass,0
-
-
-
-cspdarknet53,pass,0
-
-
-
 deit_base_distilled_patch16_224,pass,0



-dla102,pass,0
-
-
-
 dm_nfnet_f0,pass,0



-dpn107,pass,0
-
-
-
-eca_botnext26ts_256,pass,0
-
-
-
-eca_halonext26ts,pass,0
-
-
-
-ese_vovnet19b_dw,pass,0
-
-
-
-fbnetc_100,pass,0
-
-
-
-fbnetv3_b,pass,0
-
-
-
-gernet_l,pass,0
-
-
-
 ghostnet_100,pass,0



-gluon_inception_v3,pass,0
-
-
-
-gmixer_24_224,pass,0
-
-
-
-gmlp_s16_224,pass,0
-
-
-
-hrnet_w18,pass,0
-
-
-
 inception_v3,pass,0



-jx_nest_base,pass,0
-
-
-
-lcnet_050,pass,0
-
-
-
-levit_128,pass,0
-
-
-
-mixer_b16_224,pass,0
-
-
-
-mixnet_l,pass,0
-
-
-
-mnasnet_100,pass,0
-
-
-
 mobilenetv2_100,pass,0


@ -146,100 +42,16 @@ nfnet_l0,pass,0



-pit_b_224,pass,0
-
-
-
-pnasnet5large,pass,0
-
-
-
-poolformer_m36,pass,0
-
-
-
-regnety_002,pass,0
-
-
-
 repvgg_a2,pass,0



-res2net101_26w_4s,pass,0
-
-
-
-res2net50_14w_8s,pass,0
-
-
-
-res2next50,pass,0
-
-
-
-resmlp_12_224,pass,0
-
-
-
-resnest101e,pass,0
-
-
-
-rexnet_100,pass,0
-
-
-
-sebotnet33ts_256,pass,0
-
-
-
-selecsls42b,pass,0
-
-
-
-spnasnet_100,pass,0
-
-
-
 swin_base_patch4_window7_224,pass,0



-swsl_resnext101_32x16d,pass,0
-
-
-
 tf_efficientnet_b0,pass,0



-tf_mixnet_l,pass,0
-
-
-
-tinynet_a,pass,0
-
-
-
-tnt_s_patch16_224,pass,0
-
-
-
-twins_pcpvt_base,pass,0
-
-
-
 visformer_small,pass,0
-
-
-
-vit_base_patch16_224,pass,0
-
-
-
-volo_d1_224,pass,0
-
-
-
-xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv
@ -114,58 +114,6 @@ functorch_maml_omniglot,pass,0



-hf_Albert,pass,0
-
-
-
-hf_Bart,pass,0
-
-
-
-hf_Bert,pass,0
-
-
-
-hf_Bert_large,pass,0
-
-
-
-hf_BigBird,pass,0
-
-
-
-hf_DistilBert,pass,0
-
-
-
-hf_GPT2,pass,0
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Roberta_base,pass,0
-
-
-
-hf_T5,pass,0
-
-
-
-hf_T5_base,pass,0
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_distil_whisper,pass,0
-
-
-
 lennard_jones,pass,0


@ -278,38 +226,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientdet,model_fail_to_load,0
-
-
-
-timm_efficientnet,pass,0
-
-
-
-timm_nfnet,pass,0
-
-
-
-timm_regnet,pass,0
-
-
-
-timm_resnest,pass,0
-
-
-
-timm_vision_transformer,pass,0
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,0
-
-
-
 torch_multimodal_clip,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0



-AlbertForQuestionAnswering,pass,0
-
-
-
 AllenaiLongformerBase,pass,4


@ -18,50 +14,22 @@ BartForCausalLM,pass,0



-BartForConditionalGeneration,pass,0
-
-
-
 BertForMaskedLM,pass,0



-BertForQuestionAnswering,pass,0
-
-
-
 BlenderbotForCausalLM,pass_due_to_skip,0



-BlenderbotSmallForCausalLM,pass,0
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,0
-
-
-
-CamemBert,pass,0
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,pass,0
-
-
-
 DistilBertForMaskedLM,pass,0



-DistilBertForQuestionAnswering,pass,0
-
-
-
 DistillGPT2,pass,2


@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0



-ElectraForQuestionAnswering,pass,0
-
-
-
 GPT2ForSequenceClassification,pass,0


@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0



-LayoutLMForSequenceClassification,pass,0
-
-
-
 M2M100ForConditionalGeneration,pass,0


@ -98,10 +58,6 @@ MBartForCausalLM,pass,0



-MBartForConditionalGeneration,pass,0
-
-
-
 MT5ForConditionalGeneration,pass,0


@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0



-MegatronBertForQuestionAnswering,pass,0
-
-
-
 MobileBertForMaskedLM,pass,0



-MobileBertForQuestionAnswering,pass,0
-
-
-
 OPTForCausalLM,pass,0


@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0



-PLBartForConditionalGeneration,pass,0
-
-
-
 PegasusForCausalLM,pass,0



-PegasusForConditionalGeneration,pass,0
-
-
-
 RobertaForCausalLM,pass,0



-RobertaForQuestionAnswering,pass,0
-
-
-
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0



-botnet26t_256,pass,0
-
-
-
-cait_m36_384,pass,0
-
-
-
-coat_lite_mini,pass,0
-
-
-
-convit_base,pass,0
-
-
-
-convmixer_768_32,pass,0
-
-
-
-convnext_base,pass,0
-
-
-
-crossvit_9_240,pass,0
-
-
-
-cspdarknet53,pass,0
-
-
-
 deit_base_distilled_patch16_224,pass,0



-dla102,timeout,0
-
-
-
 dm_nfnet_f0,pass,0



-dpn107,pass,0
-
-
-
-eca_botnext26ts_256,pass,0
-
-
-
-eca_halonext26ts,pass,0
-
-
-
-ese_vovnet19b_dw,pass,0
-
-
-
-fbnetc_100,pass,0
-
-
-
-fbnetv3_b,pass,0
-
-
-
-gernet_l,pass,0
-
-
-
 ghostnet_100,pass,0



-gluon_inception_v3,pass,0
-
-
-
-gmixer_24_224,pass,0
-
-
-
-gmlp_s16_224,pass,0
-
-
-
-hrnet_w18,pass,0
-
-
-
 inception_v3,pass,0



-jx_nest_base,pass,0
-
-
-
-lcnet_050,pass,0
-
-
-
-levit_128,pass,0
-
-
-
-mixer_b16_224,pass,0
-
-
-
-mixnet_l,pass,0
-
-
-
-mnasnet_100,pass,0
-
-
-
 mobilenetv2_100,pass,0


@ -146,100 +42,16 @@ nfnet_l0,pass,0



-pit_b_224,pass,0
-
-
-
-pnasnet5large,pass,0
-
-
-
-poolformer_m36,pass,0
-
-
-
-regnety_002,pass,0
-
-
-
 repvgg_a2,pass,0



-res2net101_26w_4s,pass,0
-
-
-
-res2net50_14w_8s,pass,0
-
-
-
-res2next50,pass,0
-
-
-
-resmlp_12_224,pass,0
-
-
-
-resnest101e,pass,0
-
-
-
-rexnet_100,pass,0
-
-
-
-sebotnet33ts_256,pass,0
-
-
-
-selecsls42b,pass,0
-
-
-
-spnasnet_100,pass,0
-
-
-
 swin_base_patch4_window7_224,pass,0



-swsl_resnext101_32x16d,pass,0
-
-
-
 tf_efficientnet_b0,pass,0



-tf_mixnet_l,pass,0
-
-
-
-tinynet_a,pass,0
-
-
-
-tnt_s_patch16_224,pass,0
-
-
-
-twins_pcpvt_base,pass,0
-
-
-
 visformer_small,pass,0
-
-
-
-vit_base_patch16_224,pass,0
-
-
-
-volo_d1_224,pass,0
-
-
-
-xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0



-hf_Albert,pass,0
-
-
-
-hf_Bart,pass,0
-
-
-
-hf_Bert,pass,0
-
-
-
-hf_Bert_large,pass,0
-
-
-
-hf_BigBird,pass,27
-
-
-
-hf_DistilBert,pass,0
-
-
-
-hf_GPT2,pass,0
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Longformer,pass,4
-
-
-
-hf_Reformer,pass,5
-
-
-
-hf_Roberta_base,pass,0
-
-
-
-hf_T5,pass,0
-
-
-
-hf_T5_base,pass,0
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_distil_whisper,pass,0
-
-
-
 lennard_jones,pass,0


@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientdet,model_fail_to_load,0
-
-
-
-timm_efficientnet,pass,0
-
-
-
-timm_nfnet,pass,0
-
-
-
-timm_regnet,pass,0
-
-
-
-timm_resnest,pass,0
-
-
-
-timm_vision_transformer,pass,0
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,0
-
-
-
 torch_multimodal_clip,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0



-AlbertForQuestionAnswering,pass,0
-
-
-
 AllenaiLongformerBase,pass,4


@ -18,50 +14,22 @@ BartForCausalLM,pass,0



-BartForConditionalGeneration,pass,0
-
-
-
 BertForMaskedLM,pass,0



-BertForQuestionAnswering,pass,0
-
-
-
 BlenderbotForCausalLM,pass_due_to_skip,0



-BlenderbotSmallForCausalLM,pass,0
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,0
-
-
-
-CamemBert,pass,0
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,pass,0
-
-
-
 DistilBertForMaskedLM,pass,0



-DistilBertForQuestionAnswering,pass,0
-
-
-
 DistillGPT2,pass,2


@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0



-ElectraForQuestionAnswering,pass,0
-
-
-
 GPT2ForSequenceClassification,pass,0


@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0



-LayoutLMForSequenceClassification,pass,0
-
-
-
 M2M100ForConditionalGeneration,pass,0


@ -98,10 +58,6 @@ MBartForCausalLM,pass,0



-MBartForConditionalGeneration,pass,0
-
-
-
 MT5ForConditionalGeneration,pass,0


@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0



-MegatronBertForQuestionAnswering,pass,0
-
-
-
 MobileBertForMaskedLM,pass,0



-MobileBertForQuestionAnswering,pass,0
-
-
-
 OPTForCausalLM,pass,0


@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0



-PLBartForConditionalGeneration,pass,0
-
-
-
 PegasusForCausalLM,pass,0



-PegasusForConditionalGeneration,pass,0
-
-
-
 RobertaForCausalLM,pass,0



-RobertaForQuestionAnswering,pass,0
-
-
-
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0



-botnet26t_256,pass,0
-
-
-
-cait_m36_384,pass,0
-
-
-
-coat_lite_mini,pass,0
-
-
-
-convit_base,pass,0
-
-
-
-convmixer_768_32,pass,0
-
-
-
-convnext_base,pass,0
-
-
-
-crossvit_9_240,pass,0
-
-
-
-cspdarknet53,pass,0
-
-
-
 deit_base_distilled_patch16_224,pass,0



-dla102,timeout,0
-
-
-
 dm_nfnet_f0,pass,0



-dpn107,pass,0
-
-
-
-eca_botnext26ts_256,pass,0
-
-
-
-eca_halonext26ts,pass,0
-
-
-
-ese_vovnet19b_dw,pass,0
-
-
-
-fbnetc_100,pass,0
-
-
-
-fbnetv3_b,pass,0
-
-
-
-gernet_l,pass,0
-
-
-
 ghostnet_100,pass,0



-gluon_inception_v3,pass,0
-
-
-
-gmixer_24_224,pass,0
-
-
-
-gmlp_s16_224,pass,0
-
-
-
-hrnet_w18,pass,0
-
-
-
 inception_v3,pass,0



-jx_nest_base,pass,0
-
-
-
-lcnet_050,pass,0
-
-
-
-levit_128,pass,0
-
-
-
-mixer_b16_224,pass,0
-
-
-
-mixnet_l,pass,0
-
-
-
-mnasnet_100,pass,0
-
-
-
 mobilenetv2_100,pass,0


@ -146,100 +42,16 @@ nfnet_l0,pass,0



-pit_b_224,pass,0
-
-
-
-pnasnet5large,pass,0
-
-
-
-poolformer_m36,pass,0
-
-
-
-regnety_002,pass,0
-
-
-
 repvgg_a2,pass,0



-res2net101_26w_4s,pass,0
-
-
-
-res2net50_14w_8s,pass,0
-
-
-
-res2next50,pass,0
-
-
-
-resmlp_12_224,pass,0
-
-
-
-resnest101e,pass,0
-
-
-
-rexnet_100,pass,0
-
-
-
-sebotnet33ts_256,pass,0
-
-
-
-selecsls42b,pass,0
-
-
-
-spnasnet_100,pass,0
-
-
-
 swin_base_patch4_window7_224,pass,0



-swsl_resnext101_32x16d,pass,0
-
-
-
 tf_efficientnet_b0,pass,0



-tf_mixnet_l,pass,0
-
-
-
-tinynet_a,pass,0
-
-
-
-tnt_s_patch16_224,pass,0
-
-
-
-twins_pcpvt_base,pass,0
-
-
-
 visformer_small,pass,0
-
-
-
-vit_base_patch16_224,pass,0
-
-
-
-volo_d1_224,pass,0
-
-
-
-xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0



-hf_Albert,pass,0
-
-
-
-hf_Bart,pass,0
-
-
-
-hf_Bert,pass,0
-
-
-
-hf_Bert_large,pass,0
-
-
-
-hf_BigBird,pass,27
-
-
-
-hf_DistilBert,pass,0
-
-
-
-hf_GPT2,pass,0
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Longformer,pass,4
-
-
-
-hf_Reformer,pass,5
-
-
-
-hf_Roberta_base,pass,0
-
-
-
-hf_T5,pass,0
-
-
-
-hf_T5_base,pass,0
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_distil_whisper,pass,0
-
-
-
 lennard_jones,pass,0


@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientdet,model_fail_to_load,0
-
-
-
-timm_efficientnet,pass,0
-
-
-
-timm_nfnet,pass,0
-
-
-
-timm_regnet,pass,0
-
-
-
-timm_resnest,pass,0
-
-
-
-timm_vision_transformer,pass,0
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,0
-
-
-
 torch_multimodal_clip,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0



-AlbertForQuestionAnswering,pass,0
-
-
-
 AllenaiLongformerBase,pass,4


@ -18,50 +14,22 @@ BartForCausalLM,pass,0



-BartForConditionalGeneration,pass,0
-
-
-
 BertForMaskedLM,pass,0



-BertForQuestionAnswering,pass,0
-
-
-
 BlenderbotForCausalLM,pass_due_to_skip,0



-BlenderbotSmallForCausalLM,pass,0
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,0
-
-
-
-CamemBert,pass,0
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,pass,0
-
-
-
 DistilBertForMaskedLM,pass,0



-DistilBertForQuestionAnswering,pass,0
-
-
-
 DistillGPT2,pass,2


@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0



-ElectraForQuestionAnswering,pass,0
-
-
-
 GPT2ForSequenceClassification,pass,0


@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0



-LayoutLMForSequenceClassification,pass,0
-
-
-
 M2M100ForConditionalGeneration,pass,0


@ -98,10 +58,6 @@ MBartForCausalLM,pass,0



-MBartForConditionalGeneration,pass,0
-
-
-
 MT5ForConditionalGeneration,pass,0


@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0



-MegatronBertForQuestionAnswering,pass,0
-
-
-
 MobileBertForMaskedLM,pass,0



-MobileBertForQuestionAnswering,pass,0
-
-
-
 OPTForCausalLM,pass,0


@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0



-PLBartForConditionalGeneration,pass,0
-
-
-
 PegasusForCausalLM,pass,0



-PegasusForConditionalGeneration,pass,0
-
-
-
 RobertaForCausalLM,pass,0



-RobertaForQuestionAnswering,pass,0
-
-
-
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_timm_inference.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0



-botnet26t_256,pass,0
-
-
-
-cait_m36_384,pass,0
-
-
-
-coat_lite_mini,pass,0
-
-
-
-convit_base,pass,0
-
-
-
-convmixer_768_32,pass,0
-
-
-
-convnext_base,pass,0
-
-
-
-crossvit_9_240,pass,0
-
-
-
-cspdarknet53,pass,0
-
-
-
 deit_base_distilled_patch16_224,pass,0



-dla102,pass,0
-
-
-
 dm_nfnet_f0,pass,0



-dpn107,pass,0
-
-
-
-eca_botnext26ts_256,pass,0
-
-
-
-eca_halonext26ts,pass,0
-
-
-
-ese_vovnet19b_dw,pass,0
-
-
-
-fbnetc_100,pass,0
-
-
-
-fbnetv3_b,pass,0
-
-
-
-gernet_l,pass,0
-
-
-
 ghostnet_100,pass,0



-gluon_inception_v3,pass,0
-
-
-
-gmixer_24_224,pass,0
-
-
-
-gmlp_s16_224,pass,0
-
-
-
-hrnet_w18,pass,0
-
-
-
 inception_v3,pass,0



-jx_nest_base,pass,0
-
-
-
-lcnet_050,pass,0
-
-
-
-levit_128,pass,0
-
-
-
-mixer_b16_224,pass,0
-
-
-
-mixnet_l,pass,0
-
-
-
-mnasnet_100,pass,0
-
-
-
 mobilenetv2_100,pass,0


@ -146,100 +42,16 @@ nfnet_l0,pass,0



-pit_b_224,pass,0
-
-
-
-pnasnet5large,pass,0
-
-
-
-poolformer_m36,pass,0
-
-
-
-regnety_002,pass,0
-
-
-
 repvgg_a2,pass,0



-res2net101_26w_4s,pass,0
-
-
-
-res2net50_14w_8s,pass,0
-
-
-
-res2next50,pass,0
-
-
-
-resmlp_12_224,pass,0
-
-
-
-resnest101e,pass,0
-
-
-
-rexnet_100,pass,0
-
-
-
-sebotnet33ts_256,pass,0
-
-
-
-selecsls42b,pass,0
-
-
-
-spnasnet_100,pass,0
-
-
-
 swin_base_patch4_window7_224,pass,0



-swsl_resnext101_32x16d,pass,0
-
-
-
 tf_efficientnet_b0,pass,0



-tf_mixnet_l,pass,0
-
-
-
-tinynet_a,pass,0
-
-
-
-tnt_s_patch16_224,pass,0
-
-
-
-twins_pcpvt_base,pass,0
-
-
-
 visformer_small,pass,0
-
-
-
-vit_base_patch16_224,pass,0
-
-
-
-volo_d1_224,pass,0
-
-
-
-xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0



-hf_Albert,pass,0
-
-
-
-hf_Bart,pass,0
-
-
-
-hf_Bert,pass,0
-
-
-
-hf_Bert_large,pass,0
-
-
-
-hf_BigBird,pass,27
-
-
-
-hf_DistilBert,pass,0
-
-
-
-hf_GPT2,pass,0
-
-
-
-hf_GPT2_large,pass_due_to_skip,0
-
-
-
-hf_Longformer,pass,4
-
-
-
-hf_Reformer,pass,5
-
-
-
-hf_Roberta_base,pass,0
-
-
-
-hf_T5,pass,0
-
-
-
-hf_T5_base,pass,0
-
-
-
-hf_T5_large,pass_due_to_skip,0
-
-
-
-hf_distil_whisper,pass,0
-
-
-
 lennard_jones,pass,0


@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0



-timm_efficientdet,model_fail_to_load,0
-
-
-
-timm_efficientnet,pass,0
-
-
-
-timm_nfnet,pass,0
-
-
-
-timm_regnet,pass,0
-
-
-
-timm_resnest,pass,0
-
-
-
-timm_vision_transformer,pass,0
-
-
-
-timm_vision_transformer_large,pass_due_to_skip,0
-
-
-
-timm_vovnet,pass,0
-
-
-
 torch_multimodal_clip,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0



-AlbertForQuestionAnswering,pass,0
-
-
-
 AllenaiLongformerBase,pass,4


@ -18,50 +14,22 @@ BartForCausalLM,pass,0



-BartForConditionalGeneration,pass,0
-
-
-
 BertForMaskedLM,pass,0



-BertForQuestionAnswering,pass,0
-
-
-
 BlenderbotForCausalLM,pass_due_to_skip,0



-BlenderbotSmallForCausalLM,pass,0
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,0
-
-
-
-CamemBert,pass,0
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,pass,0
-
-
-
 DistilBertForMaskedLM,pass,0



-DistilBertForQuestionAnswering,pass,0
-
-
-
 DistillGPT2,pass,2


@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0



-ElectraForQuestionAnswering,pass,0
-
-
-
 GPT2ForSequenceClassification,pass,0


@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0



-LayoutLMForSequenceClassification,pass,0
-
-
-
 M2M100ForConditionalGeneration,pass,0


@ -98,10 +58,6 @@ MBartForCausalLM,pass,0



-MBartForConditionalGeneration,pass,0
-
-
-
 MT5ForConditionalGeneration,pass,0


@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0



-MegatronBertForQuestionAnswering,pass,0
-
-
-
 MobileBertForMaskedLM,pass,0



-MobileBertForQuestionAnswering,pass,0
-
-
-
 OPTForCausalLM,pass,0


@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0



-PLBartForConditionalGeneration,pass,0
-
-
-
 PegasusForCausalLM,pass,0



-PegasusForConditionalGeneration,pass,0
-
-
-
 RobertaForCausalLM,pass,0



-RobertaForQuestionAnswering,pass,0
-
-
-
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4



-AlbertForQuestionAnswering,pass,5
-
-
-
 AllenaiLongformerBase,pass,9


@ -18,50 +14,22 @@ BartForCausalLM,pass,6



-BartForConditionalGeneration,pass,8
-
-
-
 BertForMaskedLM,pass,5



-BertForQuestionAnswering,pass,5
-
-
-
 BlenderbotForCausalLM,eager_fail_to_run,0



-BlenderbotSmallForCausalLM,pass,6
-
-
-
-BlenderbotSmallForConditionalGeneration,pass,8
-
-
-
-CamemBert,pass,5
-
-
-
 DebertaV2ForMaskedLM,pass_due_to_skip,0



-DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
-
-
-
 DistilBertForMaskedLM,pass,5



-DistilBertForQuestionAnswering,pass,5
-
-
-
 DistillGPT2,pass,7


@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4



-ElectraForQuestionAnswering,pass,5
-
-
-
 GPT2ForSequenceClassification,pass,6


@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5



-LayoutLMForSequenceClassification,pass,6
-
-
-
 M2M100ForConditionalGeneration,pass,4


@ -98,10 +58,6 @@ MBartForCausalLM,pass,6



-MBartForConditionalGeneration,pass,8
-
-
-
 MT5ForConditionalGeneration,pass,5


@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5



-MegatronBertForQuestionAnswering,pass,5
-
-
-
 MobileBertForMaskedLM,pass,3



-MobileBertForQuestionAnswering,pass,3
-
-
-
 OPTForCausalLM,pass,8


@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6



-PLBartForConditionalGeneration,pass,8
-
-
-
 PegasusForCausalLM,pass,6



-PegasusForConditionalGeneration,pass,7
-
-
-
 RobertaForCausalLM,pass,5



-RobertaForQuestionAnswering,pass,5
-
-
-
 T5ForConditionalGeneration,pass,5


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_timm_inference.csv
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0



-botnet26t_256,pass,0
-
-
-
-cait_m36_384,pass,0
-
-
-
-coat_lite_mini,pass,0
-
-
-
-convit_base,pass,0
-
-
-
-convmixer_768_32,pass,0
-
-
-
-convnext_base,pass,0
-
-
-
-crossvit_9_240,pass,0
-
-
-
-cspdarknet53,pass,0
-
-
-
 deit_base_distilled_patch16_224,pass,0



-dla102,pass,0
-
-
-
 dm_nfnet_f0,pass,0



-dpn107,pass,0
-
-
-
-eca_botnext26ts_256,pass,0
-
-
-
-eca_halonext26ts,pass,0
-
-
-
-ese_vovnet19b_dw,pass,0
-
-
-
-fbnetc_100,pass,0
-
-
-
-fbnetv3_b,pass,0
-
-
-
-gernet_l,pass,0
-
-
-
 ghostnet_100,pass,0



-gluon_inception_v3,pass,0
-
-
-
-gmixer_24_224,pass,0
-
-
-
-gmlp_s16_224,pass,0
-
-
-
-hrnet_w18,pass,0
-
-
-
 inception_v3,pass,0



-jx_nest_base,pass,0
-
-
-
-lcnet_050,pass,0
-
-
-
-levit_128,pass,0
-
-
-
-mixer_b16_224,pass,0
-
-
-
-mixnet_l,pass,0
-
-
-
-mnasnet_100,pass,0
-
-
-
 mobilenetv2_100,pass,0


@ -146,100 +42,16 @@ nfnet_l0,pass,0



-pit_b_224,pass,0
-
-
-
-pnasnet5large,pass,0
-
-
-
-poolformer_m36,pass,0
-
-
-
-regnety_002,pass,0
-
-
-
 repvgg_a2,pass,0



-res2net101_26w_4s,pass,0
-
-
-
-res2net50_14w_8s,pass,0
-
-
-
-res2next50,pass,0
-
-
-
-resmlp_12_224,pass,0
-
-
-
-resnest101e,pass,0
-
-
-
-rexnet_100,pass,0
-
-
-
-sebotnet33ts_256,pass,0
-
-
-
-selecsls42b,pass,0
-
-
-
-spnasnet_100,pass,0
-
-
-
 swin_base_patch4_window7_224,pass,0



-swsl_resnext101_32x16d,pass,0
-
-
-
 tf_efficientnet_b0,pass,0



-tf_mixnet_l,pass,0
-
-
-
-tinynet_a,pass,0
-
-
-
-tnt_s_patch16_224,pass,0
-
-
-
-twins_pcpvt_base,pass,0
-
-
-
 visformer_small,pass,0
-
-
-
-vit_base_patch16_224,pass,0
-
-
-
-volo_d1_224,pass,0
-
-
-
-xcit_large_24_p8_224,pass,0
--- a/Show More
+++ b/Show More