document some quantization public apis

2025-10-23 06:34:55 +08:00 · 2025-10-10 13:39:03 -07:00
607 changed files with 9611 additions and 7939 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -37,9 +37,9 @@ case ${DOCKER_TAG_PREFIX} in
  rocm*)
    BASE_TARGET=rocm
    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-    # add gfx950, gfx115x conditionally starting in ROCm 7.0
+    # add gfx950 conditionally starting in ROCm 7.0
    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
    fi
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -344,7 +344,7 @@ docker build \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
-       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
+       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-e0dda9059d082537cee36be6c5e4fe3b18c880c0
+deb42f2a8e48f5032b4a98ee781a15fa87a157cf
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
+27664085f804afc83df26f740bb46c365854f2c4
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        # add gfx950 conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -115,9 +115,6 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
 # cmake-3.28.0 from pip for onnxruntime
 RUN python3 -mpip install cmake==3.28.0

-ADD ./common/patch_libstdc.sh patch_libstdc.sh
-RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
-
 # build onnxruntime 1.21.0 from sources.
 # it is not possible to build it from sources using pip,
 # so just build it from upstream repository.
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -84,9 +84,9 @@ case ${image} in
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        # add gfx950 conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -120,8 +120,9 @@ ninja==1.11.1.4
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
-#Pinned versions: 0.55.2, 0.60.0
+#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
+#For numba issue see https://github.com/pytorch/pytorch/issues/51511
 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073

 #numpy
@ -241,9 +242,10 @@ pygments==2.15.0
 #Pinned versions: 14.1.0
 #test that import:

-scikit-image==0.22.0
+scikit-image==0.19.3 ; python_version < "3.10"
+scikit-image==0.22.0 ; python_version >= "3.10"
 #Description: image processing routines
-#Pinned versions: 0.22.0
+#Pinned versions:
 #test that import: test_nn.py

 #scikit-learn
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201

 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -18,6 +18,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 .PHONY: all
 all: magma-rocm70
 all: magma-rocm64
+all: magma-rocm63

 .PHONY:
 clean:
@ -33,3 +34,8 @@ magma-rocm70:
 magma-rocm64: DESIRED_ROCM := 6.4
 magma-rocm64:
 	$(DOCKER_RUN)
+
+.PHONY: magma-rocm63
+magma-rocm63: DESIRED_ROCM := 6.3
+magma-rocm63:
+	$(DOCKER_RUN)
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -67,7 +67,7 @@ fi
 #       wheels with cxx11-abi

 echo "Checking that the gcc ABI is what we expect"
-if [[ "$(uname)" != 'Darwin' ]]; then
+if [[ "$(uname)" != 'Darwin' &&  "$(uname -m)" != "s390x" ]]; then
  # We also check that there are cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
  done

  # Perform some "warm-start" runs for a few huggingface models.
-  for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
+  for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
    python benchmarks/dynamo/check_accuracy.py \
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move this to .ci/docker/requirements-ci.txt
-python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
+python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"

 run_tests() {
    # Run nvidia-smi if available
--- a/.github/scripts/drci_mocks.json.gz
+++ b/.github/scripts/drci_mocks.json.gz
--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -18,7 +18,6 @@ class GitHubComment:
    body_text: str
    created_at: str
    author_login: str
-    author_url: Optional[str]
    author_association: str
    editor_login: Optional[str]
    database_id: int
--- a/.github/scripts/gql_mocks.json.gz
+++ b/.github/scripts/gql_mocks.json.gz
--- a/.github/scripts/test_check_labels.py
+++ b/.github/scripts/test_check_labels.py
@ -38,7 +38,6 @@ def mock_get_comments() -> list[GitHubComment]:
            body_text="mock_body_text",
            created_at="",
            author_login="",
-            author_url=None,
            author_association="",
            editor_login=None,
            database_id=1,
@ -49,7 +48,6 @@ def mock_get_comments() -> list[GitHubComment]:
            body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
            created_at="",
            author_login=BOT_AUTHORS[1],
-            author_url=None,
            author_association="",
            editor_login=None,
            database_id=2,
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -32,7 +32,6 @@ from trymerge import (
    main as trymerge_main,
    MandatoryChecksMissingError,
    MergeRule,
-    PostCommentError,
    RE_GHSTACK_DESC,
    read_merge_rules,
    remove_job_name_suffix,
@ -589,23 +588,6 @@ class TestTryMerge(TestCase):
            self.assertEqual(mock_merge_base, pr.get_merge_base())
            mocked_gh_fetch_merge_base.assert_called_once()

-    def test_app_can_revert(self, *args: Any) -> None:
-        pr = GitHubPR("pytorch", "pytorch", 164660)
-        repo = DummyGitRepo()
-        app_comment_id, impostor_comment_id = 3375785595, 3377647892
-        # Check that app can revert
-        self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
-        # But impostor can not
-        self.assertRaises(
-            PostCommentError,
-            lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
-        )
-        # Despite it's name being the name of the bot
-        self.assertEqual(
-            pr.get_comment_by_id(impostor_comment_id).author_login,
-            "pytorch-auto-revert",
-        )
-

@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -234,7 +234,6 @@ query ($owner: String!, $name: String!, $number: Int!) {
          createdAt
          author {
            login
-            url
          }
          authorAssociation
          editor {
@ -1094,7 +1093,6 @@ class GitHubPR:
            body_text=node["bodyText"],
            created_at=node["createdAt"] if "createdAt" in node else "",
            author_login=node["author"]["login"],
-            author_url=node["author"].get("url", None),
            author_association=node["authorAssociation"],
            editor_login=editor["login"] if editor else None,
            database_id=node["databaseId"],
@ -2031,11 +2029,6 @@ def validate_revert(
    # For some reason, one can not be a member of private repo, only CONTRIBUTOR
    if pr.is_base_repo_private():
        allowed_reverters.append("CONTRIBUTOR")
-    # Special case the pytorch-auto-revert app, whose does not have association
-    # But should be able to issue revert command
-    if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
-        allowed_reverters.append("NONE")
-
    if author_association not in allowed_reverters:
        raise PostCommentError(
            f"Will not revert as @{author_login} is not one of "
--- a/.github/workflows/inductor-perf-test-nightly-macos.yml
+++ b/.github/workflows/inductor-perf-test-nightly-macos.yml
@ -63,7 +63,6 @@ jobs:
      # Same as the build job
      python-version: 3.12.7
      test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
-      timeout-minutes: 300
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
--- a/.github/workflows/rocm.yml
+++ b/.github/workflows/rocm.yml
@ -59,29 +59,3 @@ jobs:
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
    secrets: inherit
-
-  linux-jammy-rocm-py3_10-gfx1100-test:
-    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3_10-gfx1100
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
-        ]}
-      tests-to-include: >
-         test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
-         test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
-         inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
-         inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
-         inductor/test_flex_attention inductor/test_max_autotune
-    secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -88,7 +88,7 @@ torch_compile_debug/
 # Listed manually because some files in this directory are not generated
 torch/testing/_internal/generated/annotated_fn_args.py
 torch/testing/_internal/data/*.pt
-torch/headeronly/version.h
+torch/csrc/api/include/torch/version.h
 torch/csrc/cudnn/cuDNN.cpp
 torch/csrc/generated
 torch/csrc/generic/TensorMethods.cpp
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -28,7 +28,7 @@ exclude_patterns = [
    'torch/lib/**',
    'venv/**',
    '**/*.pyi',
-    "tools/experimental/torchfuzz/**",
+    "tools/experimental/dynamic_shapes/torchfuzz/**",
    'tools/test/test_selective_build.py',
 ]
 command = [
@ -198,7 +198,7 @@ exclude_patterns = [
    'tools/test/gen_operators_yaml_test.py',
    'tools/test/gen_oplist_test.py',
    'tools/test/test_selective_build.py',
-    'tools/experimental/torchfuzz/**',
+    'tools/experimental/dynamic_shapes/torchfuzz/**',
 ]
 command = [
    'python3',
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -13,9 +13,6 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
 load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
 load("//:tools/bazel.bzl", "rules")

-# Export files for use by torch/headeronly (where version.h generation now lives)
-exports_files(["version.txt"])
-
 define_targets(rules = rules)

 COMMON_COPTS = [
@ -693,9 +690,7 @@ cc_library(
            "torch/csrc/*/generated/*.h",
            "torch/csrc/jit/serialization/mobile_bytecode_generated.h",
        ] + torch_cuda_headers,
-    ) + GENERATED_AUTOGRAD_CPP + [
-        "//torch/headeronly:version_h",
-    ],
+    ) + GENERATED_AUTOGRAD_CPP + [":version_h"],
    includes = [
        "third_party/kineto/libkineto/include",
        "torch/csrc",
--- a/2
+++ b/2
@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121
 ARG INSTALL_CHANNEL=whl/nightly
 # Automatically set by buildx
 # pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
-RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0
+RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0

 ARG TARGETPLATFORM

--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -483,8 +483,8 @@ at::BlasBackend Context::blasPreferredBackend() {
 #if ROCM_VERSION >= 60300
          "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
 #endif
-#if ROCM_VERSION >= 70000
-          "gfx950", "gfx1150", "gfx1151"
+#if ROCM_VERSION >= 60500
+          "gfx950"
 #endif
      };
      for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
--- a/aten/src/ATen/SparseTensorImpl.h
+++ b/aten/src/ATen/SparseTensorImpl.h
@ -229,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
  }

  void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
-    _resize_(sparse_dim, dense_dim, size);
+    return _resize_(sparse_dim, dense_dim, size);
  }

  void resize_(
      int64_t sparse_dim,
      int64_t dense_dim,
      ArrayRef<c10::SymInt> size) {
-    _resize_(sparse_dim, dense_dim, size);
+    return _resize_(sparse_dim, dense_dim, size);
  }

  // NOTE: this function will resize the sparse tensor and also set `indices`
--- a/aten/src/ATen/TensorIndexing.cpp
+++ b/aten/src/ATen/TensorIndexing.cpp
@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
    }
  }

-  set_item(self, indices, value);
+  return set_item(self, indices, value);
 }

 } // namespace indexing
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@ -765,8 +765,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
  if (numel == 0) {
    return;
  } else if (numel < grain_size || at::get_num_threads() == 1) {
-    serial_for_each(loop, {0, numel});
-    return;
+    return serial_for_each(loop, {0, numel});
  } else {
    at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
      serial_for_each(loop, {begin, end});
--- a/aten/src/ATen/core/NamedTensor.cpp
+++ b/aten/src/ATen/core/NamedTensor.cpp
@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
 }

 void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
-  impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
+  return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
 }

 void check_names_valid_for(size_t tensor_dim, DimnameList names) {
--- a/aten/src/ATen/core/Tensor.cpp
+++ b/aten/src/ATen/core/Tensor.cpp
@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
        const std::optional<Tensor>& gradient,
        std::optional<bool> keep_graph,
        bool create_graph) const {
-  impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
+  return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
 }

 const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
  }

  void checkInvariants() const {
-    operatorDef_->op.checkInvariants();
+    return operatorDef_->op.checkInvariants();
  }

  c10::ArrayRef<at::Tag> getTags() const {
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
  }
 #endif
  const auto& kernel = entry.lookup(dispatchKeySet);
-  kernel.callBoxed(op, dispatchKeySet, stack);
+  return kernel.callBoxed(op, dispatchKeySet, stack);
 }

 } // namespace c10
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1270,7 +1270,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
  }
 #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
-    if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version
+    if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
      gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
    } else{
      at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -326,23 +326,6 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
 #endif
 }

-bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
-#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
-  if (!hasCUDA()) {
-    return false;
-  }
-  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
-  // Check for Volta cores
-  if (prop->major >= 8) {
-    return true;
-  } else {
-    return false;
-  }
-#else
-  return false;
-#endif
-}
-
 long CUDAHooks::versionCuDNN() const {
 #if AT_CUDNN_ENABLED()
  return CUDNN_VERSION;
--- a/aten/src/ATen/cuda/detail/CUDAHooks.h
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.h
@ -45,7 +45,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  bool supportsDilatedConvolutionWithCuDNN() const override;
  bool supportsDepthwiseConvolutionWithCuDNN() const override;
  bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
-  bool supportsBFloat16RNNWithCuDNN() const override;
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@ -166,10 +166,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
    return false;
  }

-  virtual bool supportsBFloat16RNNWithCuDNN() const {
-    return false;
-  }
-
  virtual long versionCuDNN() const {
    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
  }
--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s

 // used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
 static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  dynamicLayerBack(op, stack, true);
+  return dynamicLayerBack(op, stack, true);
 }

 static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  dynamicLayerBack(op, stack, false);
+  return dynamicLayerBack(op, stack, false);
 }

 TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {
--- a/aten/src/ATen/mps/EmptyTensor.cpp
+++ b/aten/src/ATen/mps/EmptyTensor.cpp
@ -12,7 +12,7 @@

 #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
 #define MPS_ERROR_RUNTIME_TOO_LOW \
-  "The MPS backend is supported on MacOS 14.0+. ", \
+  "The MPS backend is supported on MacOS 13.0+.", \
  "Current OS version can be queried using `sw_vers`"
 #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
  "as the MPS framework doesn't support float64. Please use float32 instead."
--- a/aten/src/ATen/native/BlasKernel.cpp
+++ b/aten/src/ATen/native/BlasKernel.cpp
@ -375,7 +375,7 @@ static void bf16_gemv_trans(
  const at::BFloat16 beta,
  at::BFloat16* y,
  const int incy) {
-  bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
+  return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }

 template <>
--- a/aten/src/ATen/native/BucketizationUtils.h
+++ b/aten/src/ATen/native/BucketizationUtils.h
@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
    const Tensor& raw_boundaries) {
  Tensor trimmed_sorter;
  Tensor raw_sorter;
-  searchsorted_maybe_trim_input_tensors(
+  return searchsorted_maybe_trim_input_tensors(
      trimmed_input,
      trimmed_boundaries,
      trimmed_sorter,
--- a/aten/src/ATen/native/GridSamplerUtils.h
+++ b/aten/src/ATen/native/GridSamplerUtils.h
@ -93,12 +93,6 @@ inline bool cond_cudnn_grid_sampler(
  const TensorBase& input,
  const TensorBase& grid
 ) {
-  auto st = input.scalar_type();
-  if (!(st == kDouble || st == kFloat || st == kHalf))
-    return false;
-  st = grid.scalar_type();
-  if (!(st == kDouble || st == kFloat || st == kHalf))
-    return false;
  return (
    at::native::cudnn_is_acceptable(input) &&
    at::native::cudnn_is_acceptable(grid) &&
--- a/aten/src/ATen/native/RNN.cpp
+++ b/aten/src/ATen/native/RNN.cpp
@ -108,13 +108,6 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
  return false;
 }

-bool use_cudnn(const Tensor& t) {
-  bool acceptable = at::cudnn_is_acceptable(t);
-  auto st = t.scalar_type();
-  bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();
-  return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
-}
-
 template<typename T>
 using pair_of = std::pair<T, T>;

@ -1207,7 +1200,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
      bool train,                                                           \
      bool bidirectional,                                                   \
      bool batch_first) {                                                   \
-    if (use_cudnn(_input)) {                                                \
+    if (at::cudnn_is_acceptable(_input)) {                                  \
      Tensor output, hy;                                                    \
      NAME##_cudnn_stub(                                                    \
          _input.device().type(),                                           \
@ -1269,7 +1262,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
      double dropout_p,                                                     \
      bool train,                                                           \
      bool bidirectional) {                                                 \
-    if (use_cudnn(data)) {                                                  \
+    if (at::cudnn_is_acceptable(data)) {                                    \
      Tensor output, hy;                                                    \
      NAME##_packed_cudnn_stub(                                             \
          data.device().type(),                                             \
@ -1437,7 +1430,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
      TensorList _params, bool has_biases,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
  TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
-  if (use_cudnn(_input)) {
+  if (at::cudnn_is_acceptable(_input)) {
    Tensor output, hy, cy;
    lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
            num_layers, dropout_p, train, bidirectional, batch_first);
@ -1498,7 +1491,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
      TensorList _params, bool has_biases,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
  TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
-  if (use_cudnn(data)) {
+  if (at::cudnn_is_acceptable(data)) {
    Tensor output, hy, cy;
    lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
            _params, has_biases, num_layers, dropout_p, train, bidirectional);
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@ -23,6 +23,14 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
+#include <ATen/ops/_cast_Byte_native.h>
+#include <ATen/ops/_cast_Char_native.h>
+#include <ATen/ops/_cast_Double_native.h>
+#include <ATen/ops/_cast_Float_native.h>
+#include <ATen/ops/_cast_Half_native.h>
+#include <ATen/ops/_cast_Int_native.h>
+#include <ATen/ops/_cast_Long_native.h>
+#include <ATen/ops/_cast_Short_native.h>
 #include <ATen/ops/_dim_arange_native.h>
 #include <ATen/ops/_efficientzerotensor_native.h>
 #include <ATen/ops/_empty_affine_quantized.h>
--- a/aten/src/ATen/native/TensorProperties.cpp
+++ b/aten/src/ATen/native/TensorProperties.cpp
@ -91,6 +91,9 @@ bool cudnn_is_acceptable(const TensorBase& self) {
    return false;
  if (!self.is_cuda())
    return false;
+  auto st = self.scalar_type();
+  if (!(st == kDouble || st == kFloat || st == kHalf))
+    return false;
  if (!detail::getCUDAHooks().compiledWithCuDNN())
    return false;
  // cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty
--- a/aten/src/ATen/native/VariableMethodStubs.cpp
+++ b/aten/src/ATen/native/VariableMethodStubs.cpp
@ -25,11 +25,11 @@
 namespace at::native {

 void _backward(const Tensor& self, TensorList inputs, const std::optional<Tensor>& gradient_opt, std::optional<bool> keep_graph, bool create_graph) {
-  self._backward(inputs, gradient_opt, keep_graph, create_graph);
+  return self._backward(inputs, gradient_opt, keep_graph, create_graph);
 }

 void set_data(Tensor& self, const Tensor& new_data) {
-  self.set_data(new_data);
+  return self.set_data(new_data);
 }

 Tensor data(const Tensor& self) {
@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) {
 }

 void retain_grad(Tensor& self) {
-  self.retain_grad();
+  return self.retain_grad();
 }

 bool retains_grad(const Tensor& self) {
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@ -300,8 +300,7 @@ void div_floor_kernel(TensorIteratorBase& iter) {
    // In the special case of unsigned integer division, floor division is
    // equivalent to truncation division (since the signs of the divisor and
    // dividend are always the same)
-    div_trunc_kernel(iter);
-    return;
+    return div_trunc_kernel(iter);
  } else if (isIntegralType(dtype, /*includeBool*/ false)) {
    // There's no SIMD integer division, so don't try to vectorize it.
    AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() {
--- a/aten/src/ATen/native/cpu/IndexKernel.cpp
+++ b/aten/src/ATen/native/cpu/IndexKernel.cpp
@ -749,29 +749,21 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
        // });

        if (iter_dtype == kByte) {
-          cpu_hflip_vec<uint8_t>(iter);
-          return;
+          return cpu_hflip_vec<uint8_t>(iter);
        } else if (iter_dtype == kChar) {
-          cpu_hflip_vec<int8_t>(iter);
-          return;
+          return cpu_hflip_vec<int8_t>(iter);
        } else if (iter_dtype == kInt) {
-          cpu_hflip_vec<int32_t>(iter);
-          return;
+          return cpu_hflip_vec<int32_t>(iter);
        } else if (iter_dtype == kLong) {
-          cpu_hflip_vec<int64_t>(iter);
-          return;
+          return cpu_hflip_vec<int64_t>(iter);
        } else if (iter_dtype == kShort) {
-          cpu_hflip_vec<int16_t>(iter);
-          return;
+          return cpu_hflip_vec<int16_t>(iter);
        } else if (iter_dtype == kBool) {
-          cpu_hflip_vec<bool>(iter);
-          return;
+          return cpu_hflip_vec<bool>(iter);
        } else if (iter_dtype == kFloat) {
-          cpu_hflip_vec<float>(iter);
-          return;
+          return cpu_hflip_vec<float>(iter);
        } else if (iter_dtype == kDouble) {
-          cpu_hflip_vec<double>(iter);
-          return;
+          return cpu_hflip_vec<double>(iter);
        }
      }
      // other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
@ -786,12 +778,10 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
          c == input_strides_2[1] &&
          c == iter.element_size(0) * iter.shape()[0]  // checks if dim=1 is contiguous as well
      ) {
-        cpu_hflip_channels_last_vec(iter);
-        return;
+        return cpu_hflip_channels_last_vec(iter);
      }
      // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
-      cpu_vflip_memcpy(iter);
-      return;
+      return cpu_vflip_memcpy(iter);
    }

    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",
--- a/aten/src/ATen/native/cpu/PowKernel.cpp
+++ b/aten/src/ATen/native/cpu/PowKernel.cpp
@ -96,14 +96,11 @@ static void pow_tensor_scalar_kernel(
      dtype == kBFloat16 || isComplexType(dtype)) {
    // Dispatch to fast specialization for sqrt, rsqrt and reciprocal
    if (exp_scalar.equal(.5)) {
-      sqrt_kernel(iter);
-      return;
+      return sqrt_kernel(iter);
    } else if (exp_scalar.equal(-0.5)) {
-      rsqrt_kernel(iter);
-      return;
+      return rsqrt_kernel(iter);
    } else if (exp_scalar.equal(-1.0)) {
-      reciprocal_kernel(iter);
-      return;
+      return reciprocal_kernel(iter);
    }
  }

--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl(
  } else {
    if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
      // type promotion that does cast and reduction in a single kernel
-      norm_kernel_cpu_impl<at::Half, float>(iter, val); return;
+      return norm_kernel_cpu_impl<at::Half, float>(iter, val);
    } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
      // type promotion that does cast and reduction in a single kernel
-      norm_kernel_cpu_impl<at::BFloat16, float>(iter, val); return;
+      return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
    }

    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] {
--- a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp
@ -428,11 +428,10 @@ void fp16_gemv_trans(
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0);
 #if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
  if (at::globalContext().allowFP16ReductionCPU()) {
-    fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
-    return;
+    return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
  }
 #endif
-  fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
+  return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
 }

 float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) {
@ -466,7 +465,7 @@ void bf16_gemv_trans(
  at::BFloat16* y,
  const int incy) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0);
-  bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
+  return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
 }

 float fp16_dot(
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -285,8 +285,8 @@ static bool isSupportedHipLtROCmArch(int index) {
 #if ROCM_VERSION >= 60300
        "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
 #endif
-#if ROCM_VERSION >= 70000
-        "gfx950", "gfx1150", "gfx1151"
+#if ROCM_VERSION >= 60500
+        "gfx950"
 #endif
    };
    return at::detail::getCUDAHooks().isGPUArch(archs, index);
--- a/aten/src/ATen/native/cuda/Indexing.cu
+++ b/aten/src/ATen/native/cuda/Indexing.cu
@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() {
 #ifdef USE_ROCM
 #define SKIP_SORTED_INDICES 32
 template <typename scalar_t, int SZ>
-__global__ void indexing_backward_kernel_many_indices(
+__global__ void indexing_backward_kernel(
  const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
  int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) {
  using opmath_t = at::opmath_type<scalar_t>;
@ -254,8 +254,7 @@ __global__ void indexing_backward_kernel_stride_1(
    }
  }
 }
-#endif
-
+#else
 template <typename scalar_t, int SZ>
 __global__ void indexing_backward_kernel(
  const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@ -334,7 +333,6 @@ __global__ void indexing_backward_kernel(
  }
 }

-#ifndef USE_ROCM
 template <typename scalar_t>
 __global__ void indexing_backward_kernel_stride_1(
  const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@ -782,43 +780,11 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
            kBool,
            kBFloat16);
        } else {
-#ifdef USE_ROCM
-          if (num_indices >= 200000)
-            AT_DISPATCH_V2(
-              expandedValue.scalar_type(),
-              "indexing_backward_many_indices",
-              AT_WRAP([&] {
-                indexing_backward_kernel_many_indices<scalar_t, UNROLL><<<new_grid, block, smem_dups_size, stream>>>(
-                  sorted_indices.const_data_ptr<int64_t>(),
-                  orig_indices.const_data_ptr<int64_t>(),
-                  expandedValue.const_data_ptr<scalar_t>(),
-                  src_.mutable_data_ptr<scalar_t>(),
-                  num_indices,
-                  sliceSize,
-                  strideBefore,
-                  nElemBefore,
-                  accumulate);
-                C10_CUDA_KERNEL_LAUNCH_CHECK();
-              }),
-              AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-              // AT_EXPAND(AT_FLOAT8_TYPES),
-              // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
-              // should not be supported here, then reenable AT_FLOAT8_DTYPES
-              kFloat8_e4m3fn,
-              kFloat8_e5m2,
-              kFloat8_e4m3fnuz,
-              kFloat8_e5m2fnuz,
-              kComplexHalf,
-              kHalf,
-              kBool,
-              kBFloat16);
-          else
-#endif
          AT_DISPATCH_V2(
            expandedValue.scalar_type(),
            "indexing_backward",
            AT_WRAP([&] {
-              indexing_backward_kernel<scalar_t, UNROLL><<<grid, block, 0, stream>>>(
+              indexing_backward_kernel<scalar_t, UNROLL><<<KERNEL_GRID, block, KERNEL_SMEM, stream>>>(
                sorted_indices.const_data_ptr<int64_t>(),
                orig_indices.const_data_ptr<int64_t>(),
                expandedValue.const_data_ptr<scalar_t>(),
--- a/aten/src/ATen/native/cuda/SpectralOps.cpp
+++ b/aten/src/ATen/native/cuda/SpectralOps.cpp
@ -121,7 +121,7 @@ void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_si
    "cufft_set_plan_cache_max_size: expected 0 <= device_index < ",
    at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
    device_index);
-  cufft_get_plan_cache(device_index).resize(max_size);
+  return cufft_get_plan_cache(device_index).resize(max_size);
 }

 int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index) {
@ -137,7 +137,7 @@ void cufft_clear_plan_cache_impl(DeviceIndex device_index) {
    "cufft_clear_plan_cache: expected 0 <= device_index < ",
    at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
    device_index);
-  cufft_get_plan_cache(device_index).clear();
+  return cufft_get_plan_cache(device_index).clear();
 }

 } // namespace at::native::detail
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@ -230,7 +230,7 @@ constexpr int BLOCK_THREADS = 256;
 constexpr int RADIX_BITS = 8;
 constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS
 constexpr int RADIX_MASK = (RADIX_DIGITS - 1);
-static_assert(RADIX_DIGITS <= BLOCK_THREADS, "RADIX_DIGITS must be <= BLOCK_THREADS");
+static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS");
 constexpr int MIN_ITEMS_PER_THREAD = 4;
 constexpr int MAX_ITEMS_PER_THREAD = 64;

@ -242,10 +242,11 @@ __global__ void fill(T* x, T value, IndexType size) {
  }
 }

-// compute local histogram for each block
+// find the kth smallest value,
+// for largest topk, k_to_find = slice_size - k + 1
 template <typename T, typename IndexType, typename Bitwise, int Dim>
 C10_LAUNCH_BOUNDS_1(BLOCK_THREADS)
-__global__ void computeBlockDigitCounts(
+__global__ void radixFindKthValues(
    at::cuda::detail::TensorInfo<const T, IndexType> input,
    uint32_t slice_size,
    uint32_t* ks_to_find,  // size: num_slices, unused arg but for mysterious reasons perf is better when it's present
@ -320,51 +321,12 @@ __global__ void computeBlockDigitCounts(
  }
 }

-// compute global histogram and cumsum for each row
-__global__ void computeDigitCumSum(
-  short* counts,
-  uint32_t* digit_cum_sum,
-  uint32_t blocks_per_slice) {
-  int tidx = threadIdx.x + blockIdx.x * blockDim.x;
-  int digit_idx = threadIdx.x;
-  uint32_t slice_idx = blockIdx.x;
-
-  typedef cub::BlockScan<uint32_t, RADIX_DIGITS> BlockScan;
-  __shared__ typename BlockScan::TempStorage scan_storage;
-  // accumulates counters from multiple blocks
-  uint32_t digit_count = 0;
-  if (threadIdx.x < RADIX_DIGITS) {
-    constexpr int HISTO_ACCUM_TILE = 4;
-    uint32_t rounds = blocks_per_slice / HISTO_ACCUM_TILE;
-    for (int iter = 0; iter < rounds; iter++)  {
-      int base = HISTO_ACCUM_TILE * iter;
-      #pragma unroll
-      for (int j = 0; j < HISTO_ACCUM_TILE; j++) {
-        int blk = base + j;
-        digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
-      }
-    }
-    for (int blk = HISTO_ACCUM_TILE * rounds; blk < blocks_per_slice; blk++)  {
-      digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
-    }
-
-  }
-  // compute the block-wide inclusive prefix sum
-  uint32_t digit_count_cumsum;
-  BlockScan(scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
-  __syncthreads();
-  if (threadIdx.x < RADIX_DIGITS) {
-    digit_cum_sum[tidx] = digit_count_cumsum;
-  }
-}
-
 // Assumption: k can not be larger than UINT32_MAX
 template <typename Bitwise, typename T>
 C10_LAUNCH_BOUNDS_1(RADIX_DIGITS)  // one thread per digit
 __global__ void computeBlockwiseWithinKCounts(
  Bitwise* desires_in,          // size: num_slices
  short* counts,             // size: num_slices * blocks_per_slice * radix_digits
-  uint32_t* digit_cum_sum,
  uint32_t* ks_to_find_in,  // size: num_slices
  uint32_t blocks_per_slice,
  int current_bit,
@ -376,7 +338,7 @@ __global__ void computeBlockwiseWithinKCounts(
  Bitwise* desires_out,
  uint32_t num_blocks
 ) {
-  // This kernel should be launched with the same number of blocks as the `computeBlockDigitCounts` kernel.
+  // This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel.
  int tidx = threadIdx.x;
  uint32_t block_idx = getLinearBlockId<uint32_t>();
  uint32_t slice_idx = block_idx / blocks_per_slice;
@ -389,15 +351,36 @@ __global__ void computeBlockwiseWithinKCounts(
  if (block_idx >= num_blocks) {
    return;
  }
+  typedef cub::BlockScan<uint32_t, BLOCK_THREADS> BlockScan;
+  union __align__(16) TempStorage {
+    uint32_t digit_count_cumsum[RADIX_DIGITS]; // per-digit inclusive prefix sums; each thread reads its left neighbor's entry (shares storage with scan_storage)
+    typename BlockScan::TempStorage scan_storage;
+  };
+  __shared__ TempStorage temp_storage;

+  // accumulates counters from multiple blocks
+  uint32_t digit_count = 0;
+  if (tidx < RADIX_DIGITS) {
+    for (int blk = 0; blk < blocks_per_slice; ++blk) {
+      digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + tidx];
+    }
+  }
+
+  // compute the block-wide inclusive prefix sum
+  uint32_t digit_count_cumsum;
+  BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
+  __syncthreads();
+  // every thread also needs the prefix sum of its left neighbor for comparison, so save a copy in shared mem
+  if (tidx < RADIX_DIGITS) {
+    temp_storage.digit_count_cumsum[tidx] = digit_count_cumsum;
+  }
+  __syncthreads();

  __shared__ Bitwise desired;
  uint32_t k_to_find = ks_to_find_in[slice_idx];

  if (tidx < RADIX_DIGITS) {
-    uint32_t position = slice_idx * RADIX_DIGITS + tidx;
-    uint32_t digit_count_cumsum = digit_cum_sum[position];
-    uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : digit_cum_sum[position - 1];
+    uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1];

    // if not the last pass: update desired and ks_to_find
    // if last pass: write out the kth value
@ -483,7 +466,7 @@ template <typename Bitwise>
 __global__ void computeBlockwiseKthCounts(
  Bitwise* desires,            // size: num_slices
  short* counts,               // size: num_slices * blocks_per_slice * radix_digits
-  uint32_t num_blocks,         // the number of blocks used by `computeBlockDigitCounts` kernel
+  uint32_t num_blocks,         // the number of blocks used by `radixFindKthValues` kernel
  uint32_t blocks_per_slice,
  // outputs:
  uint32_t* kthCounts          // size: num_slices * blocks_per_slice == num_blocks
@ -666,7 +649,9 @@ void launch(
  T* kthValues = reinterpret_cast<T*>(kthValues_buffer.get());

  TORCH_CHECK(blocks_per_slice <= std::numeric_limits<uint32_t>::max(), "blocks_per_slice larger than uint32 maximum is not supported");
-
+  auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t));
+  uint32_t* semaphores = reinterpret_cast<uint32_t*>(semaphores_buffer.get());
+  AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream));

  auto ks_to_find_buffer = allocator.allocate(2 * numInputSlices * sizeof(uint32_t));
  uint32_t* ks_to_find = reinterpret_cast<uint32_t*>(ks_to_find_buffer.get());
@ -683,10 +668,6 @@ void launch(
  static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits<short>::max(),
    "blockwise counter too large");

-  auto digit_cum_sum_buffer = allocator.allocate(numInputSlices * RADIX_DIGITS * sizeof(uint32_t));
-  uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
-  AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
-
 #if CUB_SUPPORTS_SCAN_BY_KEY()
  auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
  uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
@ -710,7 +691,7 @@ void launch(

  // iterate radix bits for multiple passes
  for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) {
-    computeBlockDigitCounts<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
+    radixFindKthValues<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
        input,
        inputSliceSize,
        ks_to_find_in, // unused arg
@ -723,14 +704,10 @@ void launch(
        desired_in,
        counts);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-    computeDigitCumSum<<<numInputSlices, RADIX_DIGITS, 0, stream>>>(counts, digit_cum_sum, blocks_per_slice);
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
-
    // we unconditionally call this kernel to update desired/ks_to_find/kthValues
    // if cub supports scan_by_key we additionally do k counts
    computeBlockwiseWithinKCounts<Bitwise, T><<<grid, RADIX_DIGITS, 0, stream>>>(
-      desired_in, counts, digit_cum_sum, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
+      desired_in, counts, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    // swap desired/ks_to_find in and out for next iter
    auto tmp_desired = desired_in;
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
@ -1107,14 +1107,10 @@ void ldl_factor_kernel(
  auto preferred_backend = at::globalContext().linalgPreferredBackend();
  switch (preferred_backend) {
    case at::LinalgBackend::Cusolver:
-       { ldl_factor_cusolver(
+      return ldl_factor_cusolver(
          LD, pivots, info, upper, hermitian);
-        return;
-}
    case at::LinalgBackend::Magma:
-       { ldl_factor_magma(LD, pivots, info, upper, hermitian);
-        return;
-}
+      return ldl_factor_magma(LD, pivots, info, upper, hermitian);
    default:
    // By default use cusolver if available and magma otherwise.
    // If cusolver and magma 2.5.4+ are both available and hermitian=true,
@ -1126,10 +1122,8 @@ void ldl_factor_kernel(
            LD, pivots, info, upper, hermitian);
      }
 #endif
-    { ldl_factor_cusolver(
-      LD, pivots, info, upper, hermitian);
-      return;
-    }
+      return ldl_factor_cusolver(
+          LD, pivots, info, upper, hermitian);
 #else
      return ldl_factor_magma(LD, pivots, info, upper, hermitian);
 #endif
@ -1845,14 +1839,11 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
      // For the benchmarks see
      // https://github.com/pytorch/pytorch/pull/56253#discussion_r622851107
      if (input.size(-2) <= 256 && batchCount(input) >= std::max<int64_t>(2, input.size(-2) / 16)) {
-        geqrf_batched_cublas(input, tau);
-        return;
+        return geqrf_batched_cublas(input, tau);
      } else {
-        geqrf_cusolver(input, tau);
-        return;
+        return geqrf_cusolver(input, tau);
      }
-      geqrf_batched_cublas(input, tau);
-      return;
+      return geqrf_batched_cublas(input, tau);
  };

  auto preferred_backend = at::globalContext().linalgPreferredBackend();
@ -1865,14 +1856,10 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
  // - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly.
  // - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu
    case at::LinalgBackend::Magma:
-      { geqrf_magma(input, tau);
-        return;
-      }
+      return geqrf_magma(input, tau);
    case at::LinalgBackend::Cusolver:
    default:
-      { geqrf_cusolver_backend(input, tau);
-        return;
-      }
+      return geqrf_cusolver_backend(input, tau);
  }
 #else
  return geqrf_magma(input, tau);
@ -2716,17 +2703,13 @@ void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
  auto preferred_backend = at::globalContext().linalgPreferredBackend();
  switch (preferred_backend) {
    case at::LinalgBackend::Magma:
-      { gels_magma(a, b, infos);
-        return;
-      }
+      return gels_magma(a, b, infos);
    case at::LinalgBackend::Cusolver:
    default:
      // linalg_lstsq_gels is a generic function that is implemented using
      // geqrf_stub, ormqr_stub, and triangular_solve_stub
      // It dispatches to cuSOLVER for CUDA inputs if USE_LINALG_SOLVER is defined
-      { linalg_lstsq_gels(a, b, infos);
-        return;
-      }
+      return linalg_lstsq_gels(a, b, infos);
  }
 #else
  return gels_magma(a, b, infos);
--- a/aten/src/ATen/native/cudnn/RNN.cpp
+++ b/aten/src/ATen/native/cudnn/RNN.cpp
@ -1222,7 +1222,7 @@ cudnnRNNAlgo_t get_algo(
 }

 cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
-  if (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) {
+  if (dtype == CUDNN_DATA_HALF) {
    return CUDNN_DATA_FLOAT;
  }
  return dtype;
--- a/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip
+++ b/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip
@ -772,21 +772,13 @@ void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {

 template <>
 void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
-  static const std::vector<std::string> wmma_archs = {
-    "gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
-#if ROCM_VERSION >= 70000
-    "gfx1150", "gfx1151"
-#endif
-  };
-  if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  std::string_view arch(dprops->gcnArchName);
+  if (arch == "gfx1100") { // NOTE(review): exact match on gcnArchName; the at::Half specialization uses isGPUArch({"gfx1100"}) - confirm both select the same devices
     dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
-  }
-  else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
+  } else{
     dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
   }
-  else {
-    TORCH_CHECK(false, "gemm_internal_ck<at::BFloat16> unsupported gfx arch");
-  }
 }

 } // namespace at::native
--- a/aten/src/ATen/native/hip/ck_gemm_half.hip
+++ b/aten/src/ATen/native/hip/ck_gemm_half.hip
@ -599,21 +599,11 @@ void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {

 template <>
 void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
-  static const std::vector<std::string> wmma_archs = {
-    "gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
-#if ROCM_VERSION >= 70000
-    "gfx1150", "gfx1151"
-#endif
-  };
-  if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
+  if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) {
    dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
-  }
-  else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
+  } else{
    dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
  }
-  else {
-    TORCH_CHECK(false, "gemm_internal_ck<at::Half> unsupported gfx arch");
-  }
 }

 } // namespace at::native
--- a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp
@ -373,67 +373,59 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kStrided) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        addmm_dense_result(
+        return addmm_dense_result(
            mat2.transpose(0, 1).to_sparse_csr(),
            mat1.transpose(0, 1),
            beta,
            alpha,
            result.transpose(0, 1));
-            return;
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
-        addmm_dense_result(
+        return addmm_dense_result(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
            result.transpose(-2, -1));
-            return;
      }
    }
    if (mat2.layout() == kSparseBsc) {
      if (result.layout() == kStrided) {
-        addmm_dense_result(
+        return addmm_dense_result(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
            result.transpose(-2, -1));
-            return;
      }
    }
  }
  if (mat1.layout() == kSparseCsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
-        addmm_dense_result(mat1, mat2, beta, alpha, result);
-        return;
+        return addmm_dense_result(mat1, mat2, beta, alpha, result);
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kStrided) {
-        addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
-        return;
+        return addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
      }
      if (result.layout() == kSparseCsr) {
-        addmm_sparse_result(mat1, mat2, beta, alpha, result);
-        return;
+        return addmm_sparse_result(mat1, mat2, beta, alpha, result);
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
        // TODO: CSR @ CSC kernel would be very fast due to format alignment
-        addmm_sparse_input_dense_result(
-          mat1, mat2.to_sparse_csr(), beta, alpha, result);
-        return;
+        return addmm_sparse_input_dense_result(
+            mat1, mat2.to_sparse_csr(), beta, alpha, result);
      }
      if (result.layout() == kSparseCsr) {
        // TODO: CSR @ CSC kernel would be very fast due to format alignment
-        addmm_sparse_result(
-          mat1, mat2.to_sparse_csr(), beta, alpha, result);
-        return;
+        return addmm_sparse_result(
+            mat1, mat2.to_sparse_csr(), beta, alpha, result);
      }
    }
  }
@ -441,62 +433,56 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
        // TODO: avoid csc->csr conversion with native csc support
-        addmm_dense_result(
-          mat1.to_sparse_csr(), mat2, beta, alpha, result);
-        return;
+        return addmm_dense_result(
+            mat1.to_sparse_csr(), mat2, beta, alpha, result);
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kSparseCsr) {
        // TODO: avoid csc->csr conversion with native csc support
-        addmm_sparse_result(
-          mat1.to_sparse_csr(), mat2, beta, alpha, result);
-        return;
+        return addmm_sparse_result(
+            mat1.to_sparse_csr(), mat2, beta, alpha, result);
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
-        addmm_sparse_input_dense_result(
-          mat2.transpose(-2, -1),
-          mat1.transpose(-2, -1),
-          beta,
-          alpha,
-          result.transpose(-2, -1));
-        return;
+        return addmm_sparse_input_dense_result(
+            mat2.transpose(-2, -1),
+            mat1.transpose(-2, -1),
+            beta,
+            alpha,
+            result.transpose(-2, -1));
      }
      if (result.layout() == kSparseCsr) {
        // TODO avoid csc->csr
-        addmm_sparse_result(
-          mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
-        return;
+        return addmm_sparse_result(
+            mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
      }
      if (result.layout() == kSparseCsc) {
-        addmm_sparse_result(
-          mat2.transpose(-2, -1),
-          mat1.transpose(-2, -1),
-          beta,
-          alpha,
-          result.transpose(-2, -1));
-        return;
+        return addmm_sparse_result(
+            mat2.transpose(-2, -1),
+            mat1.transpose(-2, -1),
+            beta,
+            alpha,
+            result.transpose(-2, -1));
      }
    }
  }
  if (mat1.layout() == kSparseBsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
-        addmm_dense_result(mat1, mat2, beta, alpha, result);
-        return;
+        return addmm_dense_result(mat1, mat2, beta, alpha, result);
      }
    }
  }
  TORCH_CHECK(
-    false,
-    "addmm: computation on CPU is not implemented for ",
-    result.layout(),
-    " + ",
-    mat1.layout(),
-    " @ ",
-    mat2.layout());
+      false,
+      "addmm: computation on CPU is not implemented for ",
+      result.layout(),
+      " + ",
+      mat1.layout(),
+      " @ ",
+      mat2.layout());
 }

 /*
@ -510,16 +496,16 @@ void addmm_out_sparse_csr(
               [out] result of the operation.
 */
 void addmv_out_sparse_csr(
-  const Tensor& mat,
-  const Tensor& vec,
-  const Scalar& beta,
-  const Scalar& alpha,
-  const Tensor& result) {
+    const Tensor& mat,
+    const Tensor& vec,
+    const Scalar& beta,
+    const Scalar& alpha,
+    const Tensor& result) {
 #if !AT_USE_MKL_SPARSE()
  TORCH_CHECK(
-    false,
-    "Calling addmv on a sparse CPU tensor requires Linux platform. ",
-    "Please use PyTorch built with MKL on Linux.");
+      false,
+      "Calling addmv on a sparse CPU tensor requires Linux platform. ",
+      "Please use PyTorch built with MKL on Linux.");
 #else
  c10::MaybeOwned<Tensor> result_ = prepare_dense_vector_for_mkl(result);
  c10::MaybeOwned<Tensor> vec_ = prepare_dense_vector_for_mkl(vec);
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -5,6 +5,38 @@
 # representing ScalarType's. They are now superseded by usage of
 # `aten::to()`. The ops remain here for backward compatibility purposes.

+# DEPRECATED. DO NOT USE
+- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
 # Computes the gradient of current tensor w.r.t. graph leaves.
 - func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
  manual_cpp_binding: True
--- a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
@ -810,8 +810,7 @@ void addmm_out_sparse_csr(
  if (mat1.layout() == kSparseBsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided)
-         { block_sparse_mm(input, mat1, mat2, beta, alpha, result); return;
-}
+        return block_sparse_mm(input, mat1, mat2, beta, alpha, result);
    }
  }

@ -820,13 +819,13 @@ void addmm_out_sparse_csr(
      if (result.layout() == kStrided) {
        auto result_t = result.transpose(-2, -1);
        auto input_t = (result.is_same(input) ? result_t : input.transpose(-2, -1));
-        block_sparse_mm(
+        return block_sparse_mm(
            input_t,
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
-            result_t); return;
+            result_t);
      }
    }
  }
@ -841,41 +840,41 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kStrided) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        spmm(
+        return spmm(
            mat2.transpose(0, 1).to_sparse_csr(),
            mat1.transpose(0, 1),
            beta,
            alpha,
-            result.transpose(0, 1)); return;
+            result.transpose(0, 1));
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kStrided) {
-        spmm(
+        return spmm(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
-            result.transpose(-2, -1)); return;
+            result.transpose(-2, -1));
      }
    }
  }
  if (mat1.layout() == kSparseCsr) {
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
-        spmm(mat1, mat2, beta, alpha, result); return;
+        return spmm(mat1, mat2, beta, alpha, result);
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kSparseCsr) {
-        spgemm(mat1, mat2, beta, alpha, result); return;
+        return spgemm(mat1, mat2, beta, alpha, result);
      }
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kSparseCsr) {
        // TODO: Add native CSC support via cuSPARSE if supported.
        // CSR @ CSC kernel would be very fast due to format alignment
-        spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result); return;
+        return spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result);
      }
    }
  }
@ -883,28 +882,27 @@ void addmm_out_sparse_csr(
    if (mat2.layout() == kStrided) {
      if (result.layout() == kStrided) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
+        return spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
      }
    }
    if (mat2.layout() == kSparseCsr) {
      if (result.layout() == kSparseCsr)
        // TODO: Add native CSC support via cuSPARSE if supported.
-         { spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
-}
+        return spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
    }
    if (mat2.layout() == kSparseCsc) {
      if (result.layout() == kSparseCsr) {
        // TODO: Add native CSC support via cuSPARSE if supported.
-        spgemm(
-            mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result); return;
+        return spgemm(
+            mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
      }
      if (result.layout() == kSparseCsc) {
-        spgemm(
+        return spgemm(
            mat2.transpose(-2, -1),
            mat1.transpose(-2, -1),
            beta,
            alpha,
-            result.transpose(-2, -1)); return;
+            result.transpose(-2, -1));
      }
    }
  }
@ -935,7 +933,7 @@ void addmv_out_sparse_csr(
    const Scalar& alpha,
    const Tensor& result) {
  if (mat.layout() == kSparseBsr) {
-    block_sparse_mv(mat, vec, beta, alpha, result); return;
+    return block_sparse_mv(mat, vec, beta, alpha, result);
  }
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;

@ -1215,9 +1213,9 @@ void triangular_solve_out_sparse_csr(
  }
  if (A.layout() == kSparseBsr) {
    if (B.size(-1) == 1) {
-      block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular); return;
+      return block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular);
    } else {
-      block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular); return;
+      return block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular);
    }
  }
 #ifdef USE_ROCM
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py
@ -117,7 +117,7 @@ class FwdKernel:
    def get_all(cls) -> list["FwdKernel"]:
        kernels: list[FwdKernel] = []
        for aligned, dtype, (sm, sm_max) in itertools.product(
-            [True, False], DTYPES.keys(), itertools.pairwise(SM)
+            [True, False], DTYPES.keys(), zip(SM, SM[1:])
        ):
            # Remove some kernels we don't use
            if dtype == "bf16" and sm < 80:
@ -228,7 +228,7 @@ class BwdKernel:
        for aligned, dtype, (sm, sm_max), apply_dropout, max_k in itertools.product(
            [True, False],
            DTYPES.keys(),
-            itertools.pairwise(SM),
+            zip(SM, SM[1:]),
            [True, False],
            [32, 64, 128, 2**16],
        ):
--- a/benchmarks/distributed/bench_nvshmem_tile_reduce.py
+++ b/benchmarks/distributed/bench_nvshmem_tile_reduce.py
@ -1,191 +0,0 @@
-#!/usr/bin/env python3
-"""
-Benchmark for NVSHMEM tile reduce operations.
-
-Usage:
-python benchmarks/distributed/bench_nvshmem_tile_reduce.py
-
-This benchmark measures the performance of tile reduce operations across different
-matrix sizes and tile configurations.
-"""
-
-import time
-
-import torch
-import torch.distributed as dist
-import torch.distributed._symmetric_memory as symm_mem
-from torch.testing._internal.common_distributed import MultiProcContinuousTest
-from torch.testing._internal.common_utils import (
-    requires_cuda_p2p_access,
-    skip_but_pass_in_sandcastle_if,
-    skipIfRocm,
-)
-
-
-# Decorator
-def requires_nvshmem():
-    return skip_but_pass_in_sandcastle_if(
-        not symm_mem.is_nvshmem_available(),
-        "bench_nvshmem_tile_reduce requires NVSHMEM, skipping benchmark",
-    )
-
-
-# So that benchmarks are written in device-agnostic way
-device_type = "cuda"
-device_module = torch.get_device_module(device_type)
-
-
-@requires_nvshmem()
-@requires_cuda_p2p_access()
-class NVSHMEMTileReduceBenchmark(MultiProcContinuousTest):
-    def _init_device(self) -> None:
-        # TODO: relieve this (seems to hang if without)
-        device_module.set_device(self.device)
-        # Set NVSHMEM as SymmMem backend
-        symm_mem.set_backend("NVSHMEM")
-
-    @property
-    def device(self) -> torch.device:
-        return torch.device(device_type, self.rank)
-
-    def _benchmark_tile_reduce_single(
-        self,
-        full_size: int,
-        tile_size: int,
-        warmup_iters: int = 5,
-        bench_iters: int = 10,
-    ) -> dict:
-        """
-        Benchmark a single configuration of tile reduce.
-
-        Args:
-            full_size: Size of the full matrix (full_size x full_size)
-            warmup_iters: Number of warmup iterations
-            bench_iters: Number of benchmark iterations
-
-        Returns:
-            Dictionary with benchmark results
-        """
-        self._init_device()
-        group_name = dist.group.WORLD.group_name
-        symm_mem.enable_symm_mem_for_group(group_name)
-
-        dtype = torch.float
-
-        # Allocate full matrices
-        full_inp = symm_mem.empty(
-            full_size, full_size, dtype=dtype, device=self.device
-        ).fill_(self.rank)
-        full_out = symm_mem.empty(
-            full_size, full_size, dtype=dtype, device=self.device
-        ).fill_(0)
-
-        slice_ut = slice(0, tile_size)
-        inp_tile = full_inp[slice_ut, slice_ut]
-        out_tile = full_out[slice_ut, slice_ut]
-
-        root = 0
-
-        # Warmup iterations
-        for _ in range(warmup_iters):
-            torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
-            torch.cuda.synchronize(self.device)
-
-        # Benchmark iterations
-        times = []
-
-        dist.barrier()
-        torch.cuda.synchronize(self.device)
-        start_time = time.perf_counter()
-
-        for _ in range(bench_iters):
-            torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
-
-        torch.cuda.synchronize(self.device)
-        end_time = time.perf_counter()
-        times.append((end_time - start_time) / bench_iters)
-
-        # Calculate statistics
-        times = torch.tensor(times, dtype=torch.float64)
-        tile_elements = tile_size * tile_size
-        tile_bytes = (
-            tile_elements * dtype.itemsize
-            if hasattr(dtype, "itemsize")
-            else tile_elements * 4
-        )
-
-        results = {
-            "full_size": full_size,
-            "tile_size": tile_size,
-            "tile_elements": tile_elements,
-            "tile_bytes": tile_bytes,
-            "world_size": self.world_size,
-            "mean_time_ms": times.mean().item() * 1000,
-            "std_time_ms": times.std().item() * 1000,
-            "min_time_ms": times.min().item() * 1000,
-            "max_time_ms": times.max().item() * 1000,
-            "throughput_gb_s": tile_bytes / (times.mean().item() * 1e9),
-            "elements_per_sec": tile_elements / times.mean().item(),
-        }
-
-        return results
-
-    @skipIfRocm
-    def test_benchmark_tile_reduce_various_sizes(self) -> None:
-        """
-        Benchmark tile reduce across various matrix sizes.
-        """
-        # Test various matrix sizes
-        tile_sizes = [512, 1024, 2048, 4096, 8192, 16384]
-        full_size = tile_sizes[-1]
-        warmup_iters = 5
-        bench_iters = 20
-
-        results = []
-
-        for tile_size in tile_sizes:
-            try:
-                result = self._benchmark_tile_reduce_single(
-                    full_size, tile_size, warmup_iters, bench_iters
-                )
-                results.append(result)
-
-                if self.rank == 0:
-                    print(
-                        f"Matrix Size: {full_size}x{full_size}, Tile Size: {tile_size}x{tile_size}"
-                    )
-                    print(
-                        f"  Mean Time: {result['mean_time_ms']:.3f} ± {result['std_time_ms']:.3f} ms"
-                    )
-                    print(f"  Throughput: {result['throughput_gb_s']:.2f} GB/s")
-                    print(f"  Bytes: {result['tile_bytes']:.0f}")
-                    print()
-
-            except Exception as e:
-                if self.rank == 0:
-                    print(f"Failed to benchmark matrix size {full_size}: {e}")
-
-        # Print summary
-        if self.rank == 0 and results:
-            print("=== BENCHMARK SUMMARY ===")
-            print(
-                f"{'Matrix Size':<12} {'Tile Size':<10} {'Time (ms)':<12} {'Throughput (GB/s)':<18} {'Bytes':<15}"
-            )
-            print("-" * 70)
-
-            for result in results:
-                print(
-                    f"{result['full_size']}x{result['full_size']:<7} "
-                    f"{result['tile_size']}x{result['tile_size']:<5} "
-                    f"{result['mean_time_ms']:<12.3f} "
-                    f"{result['throughput_gb_s']:<18.2f} "
-                    f"{result['tile_bytes']:<15.0f}"
-                )
-
-
-if __name__ == "__main__":
-    # For standalone usage, you'd need to set up distributed environment
-    # For now, this is meant to be run via the PyTorch test framework
-    from torch.testing._internal.common_utils import run_tests
-
-    run_tests()
--- a/benchmarks/dynamo/check_accuracy.py
+++ b/benchmarks/dynamo/check_accuracy.py
@ -10,6 +10,7 @@ import pandas as pd

 flaky_models = {
    "yolov3",
+    "gluon_inception_v3",
    "detectron2_maskrcnn_r_101_c4",
    "timm_efficientnet",  # see https://github.com/pytorch/pytorch/issues/148699
    "XGLMForCausalLM",  # discovered in https://github.com/pytorch/pytorch/pull/128148
@ -35,11 +36,15 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
            {
                "Background_Matting",
                "alexnet",
+                "cait_m36_384",
+                "dla102",
                "demucs",
                "densenet121",
                "detectron2_fcos_r_50_fpn",
                "doctr_det_predictor",
                "doctr_reco_predictor",
+                "dpn107",
+                "fbnetv3_b",
                "hf_BigBird",
                "hf_Longformer",
                "hf_Reformer",
@ -47,6 +52,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "hf_T5",
                "hf_T5_base",
                "hf_T5_generate",
+                "levit_128",
                "llava",
                "microbench_unbacked_tolist_sum",
                "mnasnet1_0",
@ -63,6 +69,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "squeezenet1_1",
                "stable_diffusion_text_encoder",
                "stable_diffusion_unet",
+                "swsl_resnext101_32x16d",
                "timm_efficientdet",
                "timm_efficientnet",
                "timm_nfnet",
--- a/benchmarks/dynamo/check_graph_breaks.py
+++ b/benchmarks/dynamo/check_graph_breaks.py
@ -10,6 +10,7 @@ import pandas as pd

 flaky_models = {
    "yolov3",
+    "gluon_inception_v3",
    "detectron2_maskrcnn_r_101_c4",
    "XGLMForCausalLM",  # discovered in https://github.com/pytorch/pytorch/pull/128148
    "detectron2_fcos_r_50_fpn",
@ -31,6 +32,7 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
        flaky_models.update(
            {
                "alexnet",
+                "cait_m36_384",
                "demucs",
                "densenet121",
                "detectron2_fcos_r_50_fpn",
@ -42,6 +44,7 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
                "hf_Roberta_base",
                "hf_T5",
                "hf_T5_base",
+                "levit_128",
                "llava",
                "microbench_unbacked_tolist_sum",
                "resnet50",
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,4



+AlbertForQuestionAnswering,pass,5
+
+
+
 AllenaiLongformerBase,pass,9


@ -14,22 +18,50 @@ BartForCausalLM,pass,6



+BartForConditionalGeneration,pass,8
+
+
+
 BertForMaskedLM,pass,5



+BertForQuestionAnswering,pass,5
+
+
+
 BlenderbotForCausalLM,eager_fail_to_run,0



+BlenderbotSmallForCausalLM,pass,6
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,8
+
+
+
+CamemBert,pass,5
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
+
+
+
 DistilBertForMaskedLM,pass,5



+DistilBertForQuestionAnswering,pass,5
+
+
+
 DistillGPT2,pass,7


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,4



+ElectraForQuestionAnswering,pass,5
+
+
+
 GPT2ForSequenceClassification,pass,6


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,5



+LayoutLMForSequenceClassification,pass,6
+
+
+
 M2M100ForConditionalGeneration,pass,4


@ -58,6 +98,10 @@ MBartForCausalLM,pass,6



+MBartForConditionalGeneration,pass,8
+
+
+
 MT5ForConditionalGeneration,pass,5


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,5



+MegatronBertForQuestionAnswering,pass,5
+
+
+
 MobileBertForMaskedLM,pass,3



+MobileBertForQuestionAnswering,pass,3
+
+
+
 OPTForCausalLM,pass,8


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,6



+PLBartForConditionalGeneration,pass,8
+
+
+
 PegasusForCausalLM,pass,6



+PegasusForConditionalGeneration,pass,7
+
+
+
 RobertaForCausalLM,pass,5



+RobertaForQuestionAnswering,pass,5
+
+
+
 T5ForConditionalGeneration,pass,5


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,7



+botnet26t_256,pass,6
+
+
+
+cait_m36_384,eager_fail_to_run,0
+
+
+
+coat_lite_mini,pass,6
+
+
+
+convit_base,pass,7
+
+
+
+convmixer_768_32,pass,5
+
+
+
+convnext_base,pass,7
+
+
+
+crossvit_9_240,pass,7
+
+
+
+cspdarknet53,pass,7
+
+
+
 deit_base_distilled_patch16_224,pass,7



+dla102,pass,7
+
+
+
 dm_nfnet_f0,pass,6



+dpn107,pass,6
+
+
+
+eca_botnext26ts_256,pass,7
+
+
+
+eca_halonext26ts,pass,7
+
+
+
+ese_vovnet19b_dw,pass,7
+
+
+
+fbnetc_100,pass,7
+
+
+
+fbnetv3_b,pass,6
+
+
+
+gernet_l,pass,6
+
+
+
 ghostnet_100,pass,6



+gluon_inception_v3,pass,7
+
+
+
+gmixer_24_224,pass,6
+
+
+
+gmlp_s16_224,pass,7
+
+
+
+hrnet_w18,pass,5
+
+
+
 inception_v3,pass,6



+jx_nest_base,pass,7
+
+
+
+lcnet_050,fail_accuracy,6
+
+
+
+levit_128,pass,7
+
+
+
+mixer_b16_224,pass,7
+
+
+
+mixnet_l,pass,6
+
+
+
+mnasnet_100,pass,7
+
+
+
 mobilenetv2_100,pass,7


@ -42,16 +146,100 @@ nfnet_l0,pass,7



+pit_b_224,pass,6
+
+
+
+pnasnet5large,pass,5
+
+
+
+poolformer_m36,pass,6
+
+
+
+regnety_002,pass,6
+
+
+
 repvgg_a2,pass,7



+res2net101_26w_4s,pass,6
+
+
+
+res2net50_14w_8s,pass,6
+
+
+
+res2next50,pass,6
+
+
+
+resmlp_12_224,pass,6
+
+
+
+resnest101e,pass,6
+
+
+
+rexnet_100,pass,7
+
+
+
+sebotnet33ts_256,pass,6
+
+
+
+selecsls42b,pass,6
+
+
+
+spnasnet_100,pass,7
+
+
+
 swin_base_patch4_window7_224,pass,7



+swsl_resnext101_32x16d,pass,6
+
+
+
 tf_efficientnet_b0,pass,6



+tf_mixnet_l,pass,6
+
+
+
+tinynet_a,pass,6
+
+
+
+tnt_s_patch16_224,pass,7
+
+
+
+twins_pcpvt_base,pass,7
+
+
+
 visformer_small,pass,7
+
+
+
+vit_base_patch16_224,pass,7
+
+
+
+volo_d1_224,pass,7
+
+
+
+xcit_large_24_p8_224,pass_due_to_skip,7
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
@ -6,26 +6,58 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,0


@ -34,6 +66,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -46,6 +82,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -54,6 +94,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -62,10 +106,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -74,14 +126,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv
@ -6,26 +6,58 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,0


@ -34,6 +66,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -46,6 +82,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -54,6 +94,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -62,10 +106,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -74,14 +126,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,timeout,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,timeout,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,4



+AlbertForQuestionAnswering,pass,5
+
+
+
 AllenaiLongformerBase,pass,9


@ -14,22 +18,50 @@ BartForCausalLM,pass,6



+BartForConditionalGeneration,pass,8
+
+
+
 BertForMaskedLM,pass,5



+BertForQuestionAnswering,pass,5
+
+
+
 BlenderbotForCausalLM,eager_fail_to_run,0



+BlenderbotSmallForCausalLM,pass,6
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,8
+
+
+
+CamemBert,pass,5
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
+
+
+
 DistilBertForMaskedLM,pass,5



+DistilBertForQuestionAnswering,pass,5
+
+
+
 DistillGPT2,pass,7


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,4



+ElectraForQuestionAnswering,pass,5
+
+
+
 GPT2ForSequenceClassification,pass,6


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,5



+LayoutLMForSequenceClassification,pass,6
+
+
+
 M2M100ForConditionalGeneration,pass,4


@ -58,6 +98,10 @@ MBartForCausalLM,pass,6



+MBartForConditionalGeneration,pass,8
+
+
+
 MT5ForConditionalGeneration,pass,5


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,5



+MegatronBertForQuestionAnswering,pass,5
+
+
+
 MobileBertForMaskedLM,pass,3



+MobileBertForQuestionAnswering,pass,3
+
+
+
 OPTForCausalLM,pass,8


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,6



+PLBartForConditionalGeneration,pass,8
+
+
+
 PegasusForCausalLM,pass,6



+PegasusForConditionalGeneration,pass,7
+
+
+
 RobertaForCausalLM,pass,5



+RobertaForQuestionAnswering,pass,5
+
+
+
 T5ForConditionalGeneration,pass,5


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_timm_training.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,7



+botnet26t_256,pass,6
+
+
+
+cait_m36_384,eager_fail_to_run,0
+
+
+
+coat_lite_mini,pass,6
+
+
+
+convit_base,pass,7
+
+
+
+convmixer_768_32,pass,5
+
+
+
+convnext_base,pass,7
+
+
+
+crossvit_9_240,pass,7
+
+
+
+cspdarknet53,pass,7
+
+
+
 deit_base_distilled_patch16_224,pass,7



+dla102,pass,7
+
+
+
 dm_nfnet_f0,pass,6



+dpn107,pass,6
+
+
+
+eca_botnext26ts_256,pass,7
+
+
+
+eca_halonext26ts,pass,7
+
+
+
+ese_vovnet19b_dw,pass,7
+
+
+
+fbnetc_100,pass,7
+
+
+
+fbnetv3_b,pass,6
+
+
+
+gernet_l,pass,6
+
+
+
 ghostnet_100,pass,6



+gluon_inception_v3,pass,7
+
+
+
+gmixer_24_224,pass,6
+
+
+
+gmlp_s16_224,pass,7
+
+
+
+hrnet_w18,pass,5
+
+
+
 inception_v3,pass,6



+jx_nest_base,pass,7
+
+
+
+lcnet_050,fail_accuracy,6
+
+
+
+levit_128,pass,7
+
+
+
+mixer_b16_224,pass,7
+
+
+
+mixnet_l,pass,6
+
+
+
+mnasnet_100,pass,7
+
+
+
 mobilenetv2_100,pass,7


@ -42,16 +146,100 @@ nfnet_l0,pass,7



+pit_b_224,pass,6
+
+
+
+pnasnet5large,pass,5
+
+
+
+poolformer_m36,pass,6
+
+
+
+regnety_002,pass,6
+
+
+
 repvgg_a2,pass,7



+res2net101_26w_4s,pass,6
+
+
+
+res2net50_14w_8s,pass,6
+
+
+
+res2next50,pass,6
+
+
+
+resmlp_12_224,pass,6
+
+
+
+resnest101e,pass,6
+
+
+
+rexnet_100,pass,7
+
+
+
+sebotnet33ts_256,pass,6
+
+
+
+selecsls42b,pass,6
+
+
+
+spnasnet_100,pass,7
+
+
+
 swin_base_patch4_window7_224,pass,7



+swsl_resnext101_32x16d,pass,6
+
+
+
 tf_efficientnet_b0,pass,6



+tf_mixnet_l,pass,6
+
+
+
+tinynet_a,pass,6
+
+
+
+tnt_s_patch16_224,pass,7
+
+
+
+twins_pcpvt_base,pass,7
+
+
+
 visformer_small,pass,7
+
+
+
+vit_base_patch16_224,pass,7
+
+
+
+volo_d1_224,pass,7
+
+
+
+xcit_large_24_p8_224,pass_due_to_skip,7
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,4



+AlbertForQuestionAnswering,pass,5
+
+
+
 AllenaiLongformerBase,pass,9


@ -14,22 +18,50 @@ BartForCausalLM,pass,6



+BartForConditionalGeneration,pass,8
+
+
+
 BertForMaskedLM,pass,5



+BertForQuestionAnswering,pass,5
+
+
+
 BlenderbotForCausalLM,eager_fail_to_run,0



+BlenderbotSmallForCausalLM,pass,6
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,8
+
+
+
+CamemBert,pass,5
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
+
+
+
 DistilBertForMaskedLM,pass,5



+DistilBertForQuestionAnswering,pass,5
+
+
+
 DistillGPT2,pass,7


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,4



+ElectraForQuestionAnswering,pass,5
+
+
+
 GPT2ForSequenceClassification,pass,6


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,5



+LayoutLMForSequenceClassification,pass,6
+
+
+
 M2M100ForConditionalGeneration,pass,4


@ -58,6 +98,10 @@ MBartForCausalLM,pass,6



+MBartForConditionalGeneration,pass,8
+
+
+
 MT5ForConditionalGeneration,pass,5


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,5



+MegatronBertForQuestionAnswering,pass,5
+
+
+
 MobileBertForMaskedLM,pass,3



+MobileBertForQuestionAnswering,pass,3
+
+
+
 OPTForCausalLM,pass,8


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,6



+PLBartForConditionalGeneration,pass,8
+
+
+
 PegasusForCausalLM,pass,6



+PegasusForConditionalGeneration,pass,7
+
+
+
 RobertaForCausalLM,pass,5



+RobertaForQuestionAnswering,pass,5
+
+
+
 T5ForConditionalGeneration,pass,5


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,fail_to_run,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,7



+botnet26t_256,pass,6
+
+
+
+cait_m36_384,eager_fail_to_run,0
+
+
+
+coat_lite_mini,pass,6
+
+
+
+convit_base,pass,7
+
+
+
+convmixer_768_32,pass,5
+
+
+
+convnext_base,pass,7
+
+
+
+crossvit_9_240,pass,7
+
+
+
+cspdarknet53,pass,7
+
+
+
 deit_base_distilled_patch16_224,pass,7



+dla102,pass,7
+
+
+
 dm_nfnet_f0,pass,6



+dpn107,pass,6
+
+
+
+eca_botnext26ts_256,pass,7
+
+
+
+eca_halonext26ts,pass,7
+
+
+
+ese_vovnet19b_dw,pass,7
+
+
+
+fbnetc_100,pass,7
+
+
+
+fbnetv3_b,pass,6
+
+
+
+gernet_l,pass,6
+
+
+
 ghostnet_100,pass,6



+gluon_inception_v3,pass,7
+
+
+
+gmixer_24_224,pass,6
+
+
+
+gmlp_s16_224,pass,7
+
+
+
+hrnet_w18,pass,5
+
+
+
 inception_v3,pass,6



+jx_nest_base,pass,7
+
+
+
+lcnet_050,pass,6
+
+
+
+levit_128,pass,7
+
+
+
+mixer_b16_224,pass,7
+
+
+
+mixnet_l,pass,6
+
+
+
+mnasnet_100,pass,7
+
+
+
 mobilenetv2_100,pass,7


@ -42,16 +146,100 @@ nfnet_l0,pass,7



+pit_b_224,pass,6
+
+
+
+pnasnet5large,pass,5
+
+
+
+poolformer_m36,pass,6
+
+
+
+regnety_002,pass,6
+
+
+
 repvgg_a2,pass,7



+res2net101_26w_4s,pass,6
+
+
+
+res2net50_14w_8s,pass,6
+
+
+
+res2next50,pass,6
+
+
+
+resmlp_12_224,pass,6
+
+
+
+resnest101e,pass,6
+
+
+
+rexnet_100,pass,7
+
+
+
+sebotnet33ts_256,pass,6
+
+
+
+selecsls42b,pass,6
+
+
+
+spnasnet_100,pass,7
+
+
+
 swin_base_patch4_window7_224,pass,7



+swsl_resnext101_32x16d,pass,6
+
+
+
 tf_efficientnet_b0,pass,6



+tf_mixnet_l,pass,6
+
+
+
+tinynet_a,pass,6
+
+
+
+tnt_s_patch16_224,pass,7
+
+
+
+twins_pcpvt_base,pass,7
+
+
+
 visformer_small,pass,7
+
+
+
+vit_base_patch16_224,pass,7
+
+
+
+volo_d1_224,pass,7
+
+
+
+xcit_large_24_p8_224,pass_due_to_skip,7
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,4



+AlbertForQuestionAnswering,pass,5
+
+
+
 AllenaiLongformerBase,pass,9


@ -14,22 +18,50 @@ BartForCausalLM,pass,6



+BartForConditionalGeneration,pass,8
+
+
+
 BertForMaskedLM,pass,5



+BertForQuestionAnswering,pass,5
+
+
+
 BlenderbotForCausalLM,eager_fail_to_run,0



+BlenderbotSmallForCausalLM,pass,6
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,8
+
+
+
+CamemBert,pass,5
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
+
+
+
 DistilBertForMaskedLM,pass,5



+DistilBertForQuestionAnswering,pass,5
+
+
+
 DistillGPT2,pass,7


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,4



+ElectraForQuestionAnswering,pass,5
+
+
+
 GPT2ForSequenceClassification,pass,6


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,5



+LayoutLMForSequenceClassification,pass,6
+
+
+
 M2M100ForConditionalGeneration,pass,4


@ -58,6 +98,10 @@ MBartForCausalLM,pass,6



+MBartForConditionalGeneration,pass,8
+
+
+
 MT5ForConditionalGeneration,pass,5


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,5



+MegatronBertForQuestionAnswering,pass,5
+
+
+
 MobileBertForMaskedLM,pass,3



+MobileBertForQuestionAnswering,pass,3
+
+
+
 OPTForCausalLM,pass,8


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,6



+PLBartForConditionalGeneration,pass,8
+
+
+
 PegasusForCausalLM,pass,6



+PegasusForConditionalGeneration,pass,7
+
+
+
 RobertaForCausalLM,pass,5



+RobertaForQuestionAnswering,pass,5
+
+
+
 T5ForConditionalGeneration,pass,5


--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,pass,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_timm_training.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,7



+botnet26t_256,pass,6
+
+
+
+cait_m36_384,eager_fail_to_run,0
+
+
+
+coat_lite_mini,pass,6
+
+
+
+convit_base,pass,7
+
+
+
+convmixer_768_32,pass,5
+
+
+
+convnext_base,pass,7
+
+
+
+crossvit_9_240,pass,7
+
+
+
+cspdarknet53,pass,7
+
+
+
 deit_base_distilled_patch16_224,pass,7



+dla102,pass,7
+
+
+
 dm_nfnet_f0,pass,6



+dpn107,pass,6
+
+
+
+eca_botnext26ts_256,pass,7
+
+
+
+eca_halonext26ts,pass,7
+
+
+
+ese_vovnet19b_dw,pass,7
+
+
+
+fbnetc_100,pass,7
+
+
+
+fbnetv3_b,pass,6
+
+
+
+gernet_l,pass,6
+
+
+
 ghostnet_100,pass,6



+gluon_inception_v3,pass,7
+
+
+
+gmixer_24_224,pass,6
+
+
+
+gmlp_s16_224,pass,7
+
+
+
+hrnet_w18,pass,5
+
+
+
 inception_v3,pass,6



+jx_nest_base,pass,7
+
+
+
+lcnet_050,pass,6
+
+
+
+levit_128,pass,7
+
+
+
+mixer_b16_224,pass,7
+
+
+
+mixnet_l,pass,6
+
+
+
+mnasnet_100,pass,7
+
+
+
 mobilenetv2_100,pass,7


@ -42,16 +146,100 @@ nfnet_l0,pass,7



+pit_b_224,pass,6
+
+
+
+pnasnet5large,pass,5
+
+
+
+poolformer_m36,pass,6
+
+
+
+regnety_002,pass,6
+
+
+
 repvgg_a2,pass,7



+res2net101_26w_4s,pass,6
+
+
+
+res2net50_14w_8s,pass,6
+
+
+
+res2next50,pass,6
+
+
+
+resmlp_12_224,pass,6
+
+
+
+resnest101e,pass,6
+
+
+
+rexnet_100,pass,7
+
+
+
+sebotnet33ts_256,pass,6
+
+
+
+selecsls42b,pass,6
+
+
+
+spnasnet_100,pass,7
+
+
+
 swin_base_patch4_window7_224,pass,7



+swsl_resnext101_32x16d,pass,6
+
+
+
 tf_efficientnet_b0,pass,6



+tf_mixnet_l,pass,6
+
+
+
+tinynet_a,pass,6
+
+
+
+tnt_s_patch16_224,pass,7
+
+
+
+twins_pcpvt_base,pass,7
+
+
+
 visformer_small,pass,7
+
+
+
+vit_base_patch16_224,pass,7
+
+
+
+volo_d1_224,pass,7
+
+
+
+xcit_large_24_p8_224,pass_due_to_skip,7
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,0



+AlbertForQuestionAnswering,pass,0
+
+
+
 AllenaiLongformerBase,pass,4


@ -14,22 +18,50 @@ BartForCausalLM,pass,0



+BartForConditionalGeneration,pass,0
+
+
+
 BertForMaskedLM,pass,0



+BertForQuestionAnswering,pass,0
+
+
+
 BlenderbotForCausalLM,pass_due_to_skip,0



+BlenderbotSmallForCausalLM,pass,0
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,0
+
+
+
+CamemBert,pass,0
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,pass,0
+
+
+
 DistilBertForMaskedLM,pass,0



+DistilBertForQuestionAnswering,pass,0
+
+
+
 DistillGPT2,pass,2


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,0



+ElectraForQuestionAnswering,pass,0
+
+
+
 GPT2ForSequenceClassification,pass,0


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,0



+LayoutLMForSequenceClassification,pass,0
+
+
+
 M2M100ForConditionalGeneration,pass,0


@ -58,6 +98,10 @@ MBartForCausalLM,pass,0



+MBartForConditionalGeneration,pass,0
+
+
+
 MT5ForConditionalGeneration,pass,0


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,0



+MegatronBertForQuestionAnswering,pass,0
+
+
+
 MobileBertForMaskedLM,pass,0



+MobileBertForQuestionAnswering,pass,0
+
+
+
 OPTForCausalLM,pass,0


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,0



+PLBartForConditionalGeneration,pass,0
+
+
+
 PegasusForCausalLM,pass,0



+PegasusForConditionalGeneration,pass,0
+
+
+
 RobertaForCausalLM,pass,0



+RobertaForQuestionAnswering,pass,0
+
+
+
 T5ForConditionalGeneration,pass,0


--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv
@ -6,6 +6,10 @@ AlbertForMaskedLM,pass,4



+AlbertForQuestionAnswering,pass,5
+
+
+
 AllenaiLongformerBase,pass,9


@ -14,22 +18,50 @@ BartForCausalLM,pass,6



+BartForConditionalGeneration,pass,8
+
+
+
 BertForMaskedLM,pass,5



+BertForQuestionAnswering,pass,5
+
+
+
 BlenderbotForCausalLM,eager_fail_to_run,0



+BlenderbotSmallForCausalLM,pass,6
+
+
+
+BlenderbotSmallForConditionalGeneration,pass,8
+
+
+
+CamemBert,pass,5
+
+
+
 DebertaV2ForMaskedLM,pass_due_to_skip,0



+DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
+
+
+
 DistilBertForMaskedLM,pass,5



+DistilBertForQuestionAnswering,pass,5
+
+
+
 DistillGPT2,pass,7


@ -38,6 +70,10 @@ ElectraForCausalLM,pass,4



+ElectraForQuestionAnswering,pass,5
+
+
+
 GPT2ForSequenceClassification,pass,6


@ -50,6 +86,10 @@ LayoutLMForMaskedLM,pass,5



+LayoutLMForSequenceClassification,pass,6
+
+
+
 M2M100ForConditionalGeneration,pass,4


@ -58,6 +98,10 @@ MBartForCausalLM,pass,6



+MBartForConditionalGeneration,pass,8
+
+
+
 MT5ForConditionalGeneration,pass,5


@ -66,10 +110,18 @@ MegatronBertForCausalLM,pass,5



+MegatronBertForQuestionAnswering,pass,5
+
+
+
 MobileBertForMaskedLM,pass,3



+MobileBertForQuestionAnswering,pass,3
+
+
+
 OPTForCausalLM,pass,8


@ -78,14 +130,26 @@ PLBartForCausalLM,pass,6



+PLBartForConditionalGeneration,pass,8
+
+
+
 PegasusForCausalLM,pass,6



+PegasusForConditionalGeneration,pass,7
+
+
+
 RobertaForCausalLM,pass,5



+RobertaForQuestionAnswering,pass,5
+
+
+
 T5ForConditionalGeneration,pass,5


--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_timm_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_timm_inference.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,0



+botnet26t_256,pass,0
+
+
+
+cait_m36_384,pass,0
+
+
+
+coat_lite_mini,pass,0
+
+
+
+convit_base,pass,0
+
+
+
+convmixer_768_32,pass,0
+
+
+
+convnext_base,pass,0
+
+
+
+crossvit_9_240,pass,0
+
+
+
+cspdarknet53,pass,0
+
+
+
 deit_base_distilled_patch16_224,pass,0



+dla102,pass,0
+
+
+
 dm_nfnet_f0,pass,0



+dpn107,pass,0
+
+
+
+eca_botnext26ts_256,pass,0
+
+
+
+eca_halonext26ts,pass,0
+
+
+
+ese_vovnet19b_dw,pass,0
+
+
+
+fbnetc_100,pass,0
+
+
+
+fbnetv3_b,pass,0
+
+
+
+gernet_l,pass,0
+
+
+
 ghostnet_100,pass,0



+gluon_inception_v3,pass,0
+
+
+
+gmixer_24_224,pass,0
+
+
+
+gmlp_s16_224,pass,0
+
+
+
+hrnet_w18,pass,0
+
+
+
 inception_v3,pass,0



+jx_nest_base,pass,0
+
+
+
+lcnet_050,pass,0
+
+
+
+levit_128,fail_to_run,0
+
+
+
+mixer_b16_224,pass,0
+
+
+
+mixnet_l,pass,0
+
+
+
+mnasnet_100,pass,0
+
+
+
 mobilenetv2_100,pass,0


@ -42,16 +146,100 @@ nfnet_l0,pass,0



+pit_b_224,pass,0
+
+
+
+pnasnet5large,pass,0
+
+
+
+poolformer_m36,pass,0
+
+
+
+regnety_002,pass,0
+
+
+
 repvgg_a2,pass,0



+res2net101_26w_4s,pass,0
+
+
+
+res2net50_14w_8s,pass,0
+
+
+
+res2next50,pass,0
+
+
+
+resmlp_12_224,pass,0
+
+
+
+resnest101e,pass,0
+
+
+
+rexnet_100,pass,0
+
+
+
+sebotnet33ts_256,pass,0
+
+
+
+selecsls42b,pass,0
+
+
+
+spnasnet_100,pass,0
+
+
+
 swin_base_patch4_window7_224,pass,0



+swsl_resnext101_32x16d,pass,0
+
+
+
 tf_efficientnet_b0,pass,0



+tf_mixnet_l,pass,0
+
+
+
+tinynet_a,pass,0
+
+
+
+tnt_s_patch16_224,pass,0
+
+
+
+twins_pcpvt_base,pass,0
+
+
+
 visformer_small,pass,0
+
+
+
+vit_base_patch16_224,pass,0
+
+
+
+volo_d1_224,pass,0
+
+
+
+xcit_large_24_p8_224,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv
@ -10,22 +10,126 @@ beit_base_patch16_224,pass,7



+botnet26t_256,pass,6
+
+
+
+cait_m36_384,eager_fail_to_run,0
+
+
+
+coat_lite_mini,pass,6
+
+
+
+convit_base,pass,7
+
+
+
+convmixer_768_32,pass,5
+
+
+
+convnext_base,pass,7
+
+
+
+crossvit_9_240,pass,7
+
+
+
+cspdarknet53,pass,7
+
+
+
 deit_base_distilled_patch16_224,pass,7



+dla102,pass,7
+
+
+
 dm_nfnet_f0,pass,6



+dpn107,pass,6
+
+
+
+eca_botnext26ts_256,pass,7
+
+
+
+eca_halonext26ts,pass,7
+
+
+
+ese_vovnet19b_dw,pass,7
+
+
+
+fbnetc_100,pass,7
+
+
+
+fbnetv3_b,pass,6
+
+
+
+gernet_l,pass,6
+
+
+
 ghostnet_100,pass,6



+gluon_inception_v3,pass,7
+
+
+
+gmixer_24_224,pass,6
+
+
+
+gmlp_s16_224,pass,7
+
+
+
+hrnet_w18,pass,5
+
+
+
 inception_v3,pass,6



+jx_nest_base,pass,7
+
+
+
+lcnet_050,pass,6
+
+
+
+levit_128,pass,7
+
+
+
+mixer_b16_224,pass,7
+
+
+
+mixnet_l,pass,6
+
+
+
+mnasnet_100,pass,7
+
+
+
 mobilenetv2_100,pass,7


@ -42,16 +146,100 @@ nfnet_l0,pass,7



+pit_b_224,pass,6
+
+
+
+pnasnet5large,pass,5
+
+
+
+poolformer_m36,pass,6
+
+
+
+regnety_002,pass,6
+
+
+
 repvgg_a2,pass,7



+res2net101_26w_4s,pass,6
+
+
+
+res2net50_14w_8s,pass,6
+
+
+
+res2next50,pass,6
+
+
+
+resmlp_12_224,pass,6
+
+
+
+resnest101e,pass,6
+
+
+
+rexnet_100,pass,7
+
+
+
+sebotnet33ts_256,pass,6
+
+
+
+selecsls42b,pass,6
+
+
+
+spnasnet_100,pass,7
+
+
+
 swin_base_patch4_window7_224,pass,7



+swsl_resnext101_32x16d,pass,6
+
+
+
 tf_efficientnet_b0,pass,6



+tf_mixnet_l,pass,6
+
+
+
+tinynet_a,pass,6
+
+
+
+tnt_s_patch16_224,pass,7
+
+
+
+twins_pcpvt_base,pass,7
+
+
+
 visformer_small,pass,7
+
+
+
+vit_base_patch16_224,pass,7
+
+
+
+volo_d1_224,pass,7
+
+
+
+xcit_large_24_p8_224,pass_due_to_skip,7
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
@ -378,7 +378,7 @@ vgg16,pass,0



-vision_maskrcnn,pass,20
+vision_maskrcnn,pass,18



--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
@ -286,7 +286,7 @@ vgg16,pass,6



-vision_maskrcnn,pass,39
+vision_maskrcnn,pass,37



--- a/Show More
+++ b/Show More